import numpy as np
import pandas as pd
import random
pd.set_option('display.max_columns',None)
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_auc_score, roc_curve, auc
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
# Fixed seed so the row shuffle below is reproducible across runs.
seed_value = 1234

# Load the training set; the literal string "NaN" is parsed as a real null.
# The rows are then shuffled (frac=1 keeps every row, in random order).
_train_path = 'C:/Users/Ramon/Documents/TFM/dataset/aps_failure_training_set_processed_8bit.csv'
train_df = pd.read_csv(_train_path, na_values=["NaN"])
train_df = train_df.sample(frac=1, random_state=seed_value)
train_df.head()
| class | aa_000 | ab_000 | ac_000 | ad_000 | ae_000 | af_000 | ag_000 | ag_001 | ag_002 | ag_003 | ag_004 | ag_005 | ag_006 | ag_007 | ag_008 | ag_009 | ah_000 | ai_000 | aj_000 | ak_000 | al_000 | am_0 | an_000 | ao_000 | ap_000 | aq_000 | ar_000 | as_000 | at_000 | au_000 | av_000 | ax_000 | ay_000 | ay_001 | ay_002 | ay_003 | ay_004 | ay_005 | ay_006 | ay_007 | ay_008 | ay_009 | az_000 | az_001 | az_002 | az_003 | az_004 | az_005 | az_006 | az_007 | az_008 | az_009 | ba_000 | ba_001 | ba_002 | ba_003 | ba_004 | ba_005 | ba_006 | ba_007 | ba_008 | ba_009 | bb_000 | bc_000 | bd_000 | be_000 | bf_000 | bg_000 | bh_000 | bi_000 | bj_000 | bk_000 | bl_000 | bm_000 | bn_000 | bo_000 | bp_000 | bq_000 | br_000 | bs_000 | bt_000 | bu_000 | bv_000 | bx_000 | by_000 | bz_000 | ca_000 | cb_000 | cc_000 | cd_000 | ce_000 | cf_000 | cg_000 | ch_000 | ci_000 | cj_000 | ck_000 | cl_000 | cm_000 | cn_000 | cn_001 | cn_002 | cn_003 | cn_004 | cn_005 | cn_006 | cn_007 | cn_008 | cn_009 | co_000 | cp_000 | cq_000 | cr_000 | cs_000 | cs_001 | cs_002 | cs_003 | cs_004 | cs_005 | cs_006 | cs_007 | cs_008 | cs_009 | ct_000 | cu_000 | cv_000 | cx_000 | cy_000 | cz_000 | da_000 | db_000 | dc_000 | dd_000 | de_000 | df_000 | dg_000 | dh_000 | di_000 | dj_000 | dk_000 | dl_000 | dm_000 | dn_000 | do_000 | dp_000 | dq_000 | dr_000 | ds_000 | dt_000 | du_000 | dv_000 | dx_000 | dy_000 | dz_000 | ea_000 | eb_000 | ec_00 | ed_000 | ee_000 | ee_001 | ee_002 | ee_003 | ee_004 | ee_005 | ee_006 | ee_007 | ee_008 | ee_009 | ef_000 | eg_000 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 30329 | -0.992188 | -0.125000 | 0.992188 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.03125 | -0.054688 | -0.117188 | -0.140625 | -0.164062 | 0.765625 | -0.023438 | -0.132812 | -0.031250 | 0.109375 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | 0.226562 | 0.242188 | -0.015625 | -0.000000 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | 0.992188 | 0.992188 | -0.03125 | -0.015625 | -0.023438 | -0.03125 | -0.03125 | -0.070312 | -0.218750 | -0.281250 | 0.992188 | -0.015625 | -0.046875 | -0.054688 | -0.031250 | -0.117188 | -0.093750 | 0.398438 | -0.109375 | -0.0625 | -0.046875 | -0.015625 | 0.625000 | 0.210938 | -0.085938 | -0.171875 | -0.234375 | -0.281250 | -0.273438 | -0.273438 | -0.140625 | -0.109375 | 0.148438 | -0.070312 | -0.140625 | 0.132812 | -0.031250 | 0.109375 | 0.203125 | -0.140625 | 0.085938 | -0.500000 | -0.570312 | -0.484375 | -0.500000 | -0.539062 | -0.812500 | -0.796875 | -0.796875 | -0.632812 | -0.125000 | 0.148438 | 0.148438 | 0.203125 | -0.148438 | -0.148438 | -0.835938 | -0.734375 | 0.210938 | 0.0 | 0.992188 | -0.007812 | 0.960938 | -0.015625 | 0.054688 | -0.09375 | 0.531250 | -0.109375 | 0.429688 | -0.039062 | -0.085938 | -0.117188 | -0.140625 | 0.109375 | 0.703125 | -0.046875 | -0.093750 | -0.093750 | -0.03125 | -0.007812 | -0.070312 | 0.148438 | -0.085938 | 0.312500 | 0.070312 | -0.132812 | 0.992188 | 0.062500 | 0.007812 | 0.351562 | 0.015625 | -0.000000 | -0.007812 | 0.992188 | 0.992188 | 0.343750 | -0.195312 | -0.046875 | -0.09375 | -0.039062 | -0.226562 | 0.312500 | 0.390625 | 0.039062 | -0.023438 | -0.03125 | -0.007812 | -0.101562 | -0.007812 | -0.03125 | -0.023438 | -0.03125 | 0.062500 | -0.015625 | -0.062500 | -0.054688 | -0.164062 | 0.507812 | 0.937500 | -0.265625 | -0.226562 | -0.203125 | -0.140625 | -0.023438 | -0.03125 | -0.078125 | 0.140625 | 0.187500 | 0.507812 | 0.539062 | 0.203125 | 0.054688 | 0.007812 | -0.179688 | -0.257812 | -0.179688 | -0.250000 | -0.171875 | -0.023438 | -0.023438 |
| 44957 | -0.992188 | -0.398438 | 0.242188 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.03125 | -0.054688 | -0.117188 | -0.187500 | -0.335938 | -0.406250 | -0.343750 | -0.164062 | -0.031250 | -0.421875 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | -0.429688 | -0.429688 | -0.320312 | -0.343750 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | -0.171875 | -0.242188 | -0.03125 | -0.015625 | -0.023438 | -0.03125 | -0.03125 | -0.078125 | -0.304688 | -0.296875 | -0.257812 | -0.015625 | -0.085938 | -0.125000 | -0.078125 | -0.132812 | -0.343750 | -0.320312 | -0.109375 | -0.0625 | -0.046875 | -0.015625 | -0.351562 | -0.367188 | -0.335938 | -0.359375 | -0.375000 | -0.367188 | -0.328125 | -0.351562 | -0.148438 | -0.109375 | -0.406250 | -0.156250 | -0.195312 | -0.156250 | -0.132812 | -0.421875 | -0.375000 | -0.328125 | -0.273438 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | -0.695312 | -0.398438 | -0.406250 | -0.406250 | -0.390625 | -0.398438 | -0.171875 | -0.835938 | -0.828125 | -0.390625 | 0.0 | -0.429688 | -0.007812 | -0.226562 | -0.015625 | -0.406250 | -0.09375 | -0.312500 | -0.109375 | -0.218750 | -0.039062 | -0.085938 | -0.148438 | -0.234375 | -0.367188 | -0.414062 | -0.312500 | -0.156250 | -0.101562 | -0.03125 | -0.007812 | -0.078125 | -0.406250 | -0.085938 | -0.382812 | -0.250000 | -0.195312 | -0.312500 | -0.210938 | -0.390625 | -0.414062 | -0.140625 | -0.015625 | -0.007812 | -0.140625 | -0.164062 | -0.507812 | -0.226562 | -0.046875 | -0.09375 | -0.039062 | -0.226562 | -0.515625 | -0.312500 | -0.179688 | -0.023438 | -0.03125 | -0.007812 | -0.101562 | -0.007812 | -0.03125 | -0.023438 | -0.03125 | -0.335938 | -0.437500 | -0.468750 | -0.054688 | -0.164062 | -0.414062 | -0.429688 | -0.351562 | -0.289062 | -0.203125 | -0.140625 | -0.023438 | -0.03125 | -0.234375 | -0.351562 | -0.390625 | -0.289062 | -0.296875 | -0.382812 | -0.382812 | -0.375000 | -0.320312 | -0.304688 | -0.195312 | -0.296875 | 
-0.164062 | -0.023438 | -0.023438 |
| 30866 | -0.992188 | -0.007812 | -0.289062 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.03125 | -0.054688 | -0.117188 | -0.179688 | -0.085938 | 0.210938 | 0.164062 | 0.015625 | -0.015625 | -0.031250 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | -0.000000 | -0.031250 | -0.015625 | 0.070312 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | -0.039062 | -0.078125 | -0.03125 | -0.015625 | -0.023438 | -0.03125 | -0.03125 | -0.078125 | 0.406250 | -0.117188 | -0.085938 | -0.015625 | 0.281250 | 0.601562 | 0.273438 | 0.648438 | 0.359375 | -0.250000 | -0.109375 | -0.0625 | -0.046875 | -0.015625 | 0.062500 | 0.101562 | -0.007812 | -0.046875 | -0.093750 | -0.140625 | -0.078125 | 0.070312 | -0.140625 | -0.109375 | -0.007812 | 0.070312 | -0.179688 | -0.093750 | -0.085938 | -0.031250 | -0.054688 | -0.046875 | 0.015625 | -0.476562 | -0.398438 | -0.210938 | -0.835938 | -0.820312 | -0.812500 | -0.796875 | -0.796875 | 0.632812 | -0.007812 | -0.007812 | -0.007812 | 0.023438 | -0.023438 | -0.164062 | 0.812500 | 0.992188 | 0.023438 | 0.0 | 0.218750 | -0.007812 | -0.132812 | -0.015625 | 0.000000 | -0.09375 | -0.062500 | -0.109375 | -0.218750 | -0.039062 | -0.085938 | -0.148438 | -0.156250 | 0.007812 | 0.226562 | 0.054688 | -0.054688 | -0.062500 | -0.03125 | -0.007812 | -0.070312 | -0.007812 | -0.085938 | 0.296875 | 0.093750 | -0.117188 | -0.046875 | -0.046875 | 0.085938 | 0.046875 | 0.164062 | 0.000000 | -0.007812 | 0.000000 | -0.093750 | -0.468750 | -0.179688 | -0.046875 | 0.65625 | -0.039062 | -0.101562 | 0.195312 | -0.046875 | -0.148438 | -0.023438 | -0.03125 | -0.007812 | -0.101562 | -0.007812 | -0.03125 | -0.023438 | -0.03125 | -0.015625 | 0.992188 | 0.992188 | -0.054688 | -0.164062 | -0.101562 | -0.132812 | 0.992188 | 0.992188 | -0.203125 | -0.140625 | -0.023438 | -0.03125 | -0.210938 | 0.273438 | 0.265625 | -0.046875 | -0.062500 | -0.039062 | 0.421875 | 0.992188 | -0.234375 | -0.312500 | -0.195312 | -0.304688 | -0.171875 | -0.023438 | -0.023438 |
| 40447 | -0.992188 | -0.132812 | -0.289062 | 0.992188 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.03125 | -0.054688 | -0.117188 | -0.179688 | 0.015625 | -0.140625 | -0.281250 | -0.148438 | -0.031250 | -0.164062 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | -0.171875 | -0.164062 | -0.132812 | -0.171875 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | 0.195312 | -0.023438 | -0.03125 | -0.015625 | -0.023438 | -0.03125 | -0.03125 | -0.078125 | -0.328125 | -0.296875 | 0.320312 | -0.015625 | -0.062500 | -0.109375 | -0.070312 | -0.132812 | -0.351562 | 0.031250 | -0.109375 | -0.0625 | -0.046875 | -0.015625 | -0.226562 | -0.234375 | 0.054688 | 0.296875 | 0.171875 | -0.140625 | -0.265625 | -0.296875 | -0.140625 | -0.109375 | -0.164062 | -0.148438 | -0.187500 | -0.132812 | -0.023438 | -0.164062 | -0.226562 | -0.093750 | -0.140625 | -0.406250 | -0.359375 | -0.882812 | -0.835938 | -0.820312 | -0.812500 | -0.796875 | -0.796875 | -0.226562 | -0.132812 | -0.164062 | -0.164062 | -0.132812 | -0.101562 | -0.171875 | -0.296875 | -0.250000 | -0.156250 | 0.0 | -0.187500 | -0.007812 | -0.039062 | -0.015625 | -0.179688 | -0.06250 | -0.171875 | -0.109375 | 0.296875 | -0.039062 | -0.085938 | -0.148438 | -0.062500 | 0.101562 | -0.335938 | -0.289062 | -0.140625 | -0.101562 | -0.03125 | -0.007812 | -0.078125 | -0.164062 | -0.062500 | -0.039062 | -0.093750 | -0.187500 | -0.070312 | -0.179688 | -0.234375 | 0.437500 | 0.054688 | 0.007812 | -0.007812 | -0.093750 | -0.015625 | -0.046875 | -0.203125 | -0.046875 | -0.09375 | -0.039062 | 0.031250 | -0.109375 | -0.109375 | -0.125000 | -0.023438 | -0.03125 | -0.007812 | -0.031250 | -0.007812 | -0.03125 | -0.023438 | -0.03125 | -0.218750 | -0.429688 | -0.453125 | -0.054688 | -0.164062 | -0.304688 | -0.296875 | 0.312500 | 0.078125 | -0.203125 | -0.140625 | -0.023438 | -0.03125 | -0.242188 | -0.117188 | -0.117188 | -0.140625 | -0.210938 | -0.250000 | -0.242188 | -0.234375 | -0.132812 | 0.359375 | -0.007812 | -0.242188 | -0.171875 
| -0.023438 | -0.023438 |
| 25580 | -0.992188 | -0.406250 | 0.015625 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.03125 | -0.054688 | -0.117188 | -0.187500 | -0.335938 | -0.421875 | -0.351562 | -0.164062 | -0.031250 | -0.429688 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | -0.437500 | -0.437500 | -0.320312 | -0.343750 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | -0.171875 | -0.257812 | -0.03125 | -0.015625 | -0.023438 | -0.03125 | -0.03125 | -0.078125 | -0.328125 | -0.304688 | -0.257812 | -0.015625 | -0.085938 | -0.132812 | -0.078125 | -0.132812 | -0.351562 | -0.328125 | -0.109375 | -0.0625 | -0.046875 | -0.015625 | -0.367188 | -0.375000 | -0.343750 | -0.367188 | -0.375000 | -0.367188 | -0.328125 | -0.351562 | -0.148438 | -0.109375 | -0.414062 | -0.156250 | -0.195312 | -0.156250 | -0.132812 | -0.429688 | -0.375000 | -0.328125 | -0.281250 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | -0.890625 | -0.406250 | -0.414062 | -0.414062 | -0.398438 | -0.406250 | -0.171875 | -0.992188 | -0.992188 | -0.398438 | 0.0 | -0.445312 | -0.007812 | -0.250000 | -0.015625 | -0.414062 | -0.09375 | -0.328125 | -0.109375 | -0.218750 | -0.039062 | -0.085938 | -0.148438 | -0.234375 | -0.375000 | -0.421875 | -0.312500 | -0.156250 | -0.101562 | -0.03125 | -0.007812 | -0.078125 | -0.414062 | -0.085938 | -0.390625 | -0.265625 | -0.195312 | -0.320312 | -0.210938 | -0.398438 | -0.460938 | -0.179688 | -0.023438 | -0.007812 | -0.156250 | -0.171875 | -0.523438 | -0.242188 | -0.046875 | -0.09375 | -0.039062 | 0.156250 | -0.523438 | -0.320312 | -0.203125 | -0.023438 | -0.03125 | -0.007812 | -0.101562 | -0.007812 | -0.03125 | -0.023438 | -0.03125 | -0.343750 | -0.445312 | -0.476562 | -0.054688 | -0.164062 | -0.414062 | -0.437500 | -0.351562 | -0.289062 | -0.203125 | -0.140625 | -0.023438 | -0.03125 | -0.242188 | -0.398438 | -0.421875 | -0.296875 | -0.304688 | -0.382812 | -0.382812 | -0.382812 | -0.351562 | -0.312500 | -0.195312 | -0.304688 | 
-0.171875 | -0.023438 | -0.023438 |
# Load the test set the same way as the training set: "NaN" strings become
# real nulls, then rows are shuffled with the shared seed for reproducibility.
_test_path = 'C:/Users/Ramon/Documents/TFM/dataset/aps_failure_test_set_processed_8bit.csv'
test_df = pd.read_csv(_test_path, na_values=["NaN"])
test_df = test_df.sample(frac=1, random_state=seed_value)
test_df.head()
| class | aa_000 | ab_000 | ac_000 | ad_000 | ae_000 | af_000 | ag_000 | ag_001 | ag_002 | ag_003 | ag_004 | ag_005 | ag_006 | ag_007 | ag_008 | ag_009 | ah_000 | ai_000 | aj_000 | ak_000 | al_000 | am_0 | an_000 | ao_000 | ap_000 | aq_000 | ar_000 | as_000 | at_000 | au_000 | av_000 | ax_000 | ay_000 | ay_001 | ay_002 | ay_003 | ay_004 | ay_005 | ay_006 | ay_007 | ay_008 | ay_009 | az_000 | az_001 | az_002 | az_003 | az_004 | az_005 | az_006 | az_007 | az_008 | az_009 | ba_000 | ba_001 | ba_002 | ba_003 | ba_004 | ba_005 | ba_006 | ba_007 | ba_008 | ba_009 | bb_000 | bc_000 | bd_000 | be_000 | bf_000 | bg_000 | bh_000 | bi_000 | bj_000 | bk_000 | bl_000 | bm_000 | bn_000 | bo_000 | bp_000 | bq_000 | br_000 | bs_000 | bt_000 | bu_000 | bv_000 | bx_000 | by_000 | bz_000 | ca_000 | cb_000 | cc_000 | cd_000 | ce_000 | cf_000 | cg_000 | ch_000 | ci_000 | cj_000 | ck_000 | cl_000 | cm_000 | cn_000 | cn_001 | cn_002 | cn_003 | cn_004 | cn_005 | cn_006 | cn_007 | cn_008 | cn_009 | co_000 | cp_000 | cq_000 | cr_000 | cs_000 | cs_001 | cs_002 | cs_003 | cs_004 | cs_005 | cs_006 | cs_007 | cs_008 | cs_009 | ct_000 | cu_000 | cv_000 | cx_000 | cy_000 | cz_000 | da_000 | db_000 | dc_000 | dd_000 | de_000 | df_000 | dg_000 | dh_000 | di_000 | dj_000 | dk_000 | dl_000 | dm_000 | dn_000 | do_000 | dp_000 | dq_000 | dr_000 | ds_000 | dt_000 | du_000 | dv_000 | dx_000 | dy_000 | dz_000 | ea_000 | eb_000 | ec_00 | ed_000 | ee_000 | ee_001 | ee_002 | ee_003 | ee_004 | ee_005 | ee_006 | ee_007 | ee_008 | ee_009 | ef_000 | eg_000 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10009 | -0.992188 | -0.203125 | 0.539062 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.031250 | -0.054688 | -0.117188 | -0.179688 | -0.250000 | -0.070312 | 0.007812 | 0.062500 | 0.023438 | -0.187500 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | -0.187500 | -0.187500 | -0.179688 | -0.187500 | -0.101562 | -0.007812 | -0.023438 | -0.015625 | 0.445312 | 0.164062 | -0.03125 | -0.015625 | -0.023438 | -0.03125 | -0.03125 | 0.070312 | 0.164062 | -0.234375 | -0.242188 | -0.015625 | -0.046875 | -0.054688 | -0.054688 | -0.023438 | -0.046875 | -0.195312 | -0.109375 | -0.062500 | -0.046875 | -0.015625 | -0.070312 | -0.250000 | -0.242188 | -0.242188 | -0.226562 | -0.203125 | -0.140625 | 0.187500 | -0.125000 | -0.109375 | -0.187500 | -0.148438 | -0.195312 | -0.132812 | -0.062500 | -0.187500 | -0.195312 | -0.195312 | -0.140625 | -0.414062 | -0.554688 | -0.148438 | -0.132812 | -0.125000 | -0.117188 | -0.109375 | -0.109375 | -0.812500 | -0.203125 | -0.187500 | -0.187500 | -0.164062 | -0.140625 | -0.156250 | -0.992188 | -0.984375 | -0.164062 | 0.0 | -0.070312 | -0.007812 | 0.031250 | -0.015625 | -0.195312 | -0.093750 | -0.171875 | -0.109375 | -0.218750 | -0.039062 | -0.085938 | -0.148438 | -0.210938 | -0.156250 | -0.070312 | -0.109375 | -0.117188 | -0.085938 | -0.031250 | -0.007812 | -0.078125 | -0.187500 | -0.085938 | -0.015625 | -0.070312 | -0.125000 | -0.125000 | -0.117188 | -0.132812 | -0.281250 | -0.070312 | -0.023438 | -0.007812 | 0.242188 | 0.054688 | -0.500000 | -0.210938 | -0.039062 | 0.234375 | -0.039062 | -0.226562 | -0.140625 | -0.117188 | -0.125000 | -0.023438 | -0.03125 | -0.007812 | -0.101562 | -0.007812 | -0.03125 | -0.023438 | -0.03125 | -0.171875 | -0.000000 | -0.171875 | -0.054688 | -0.164062 | -0.164062 | -0.250000 | 0.132812 | 0.031250 | -0.125000 | 0.031250 | -0.023438 | -0.03125 | -0.234375 | -0.054688 | -0.085938 | -0.148438 | -0.085938 | -0.132812 | -0.117188 | -0.117188 | -0.085938 | -0.171875 | -0.164062 | -0.179688 | 
-0.148438 | -0.023438 | -0.023438 |
| 15015 | -0.992188 | -0.390625 | -0.031250 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.031250 | -0.054688 | -0.117188 | -0.179688 | -0.335938 | -0.398438 | -0.312500 | -0.140625 | -0.031250 | -0.414062 | -0.054688 | -0.015625 | -0.023438 | -0.109375 | -0.109375 | -0.421875 | -0.421875 | -0.304688 | -0.335938 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | -0.054688 | -0.156250 | -0.03125 | -0.015625 | -0.023438 | -0.03125 | -0.03125 | -0.078125 | -0.281250 | -0.296875 | -0.265625 | -0.015625 | -0.101562 | -0.125000 | -0.070312 | -0.125000 | -0.328125 | -0.320312 | -0.109375 | -0.062500 | -0.046875 | -0.015625 | -0.343750 | -0.351562 | -0.335938 | -0.359375 | -0.375000 | -0.367188 | -0.328125 | -0.351562 | -0.148438 | -0.109375 | -0.398438 | -0.148438 | -0.195312 | -0.156250 | -0.132812 | -0.414062 | -0.367188 | -0.312500 | -0.257812 | -0.960938 | -0.960938 | -0.882812 | -0.835938 | -0.820312 | -0.812500 | -0.796875 | -0.796875 | -0.234375 | -0.390625 | -0.398438 | -0.398438 | -0.382812 | -0.398438 | -0.171875 | -0.531250 | -0.531250 | -0.382812 | 0.0 | -0.398438 | -0.007812 | -0.203125 | -0.015625 | -0.398438 | -0.093750 | -0.296875 | -0.109375 | -0.218750 | -0.039062 | -0.085938 | -0.148438 | -0.234375 | -0.359375 | -0.406250 | -0.289062 | -0.156250 | -0.101562 | -0.031250 | -0.007812 | -0.078125 | -0.398438 | -0.085938 | -0.460938 | -0.250000 | -0.195312 | -0.296875 | -0.203125 | -0.390625 | -0.398438 | -0.132812 | -0.007812 | -0.007812 | -0.085938 | -0.132812 | -0.492188 | -0.242188 | -0.046875 | -0.093750 | -0.039062 | -0.046875 | -0.500000 | -0.289062 | -0.171875 | -0.023438 | -0.03125 | -0.007812 | -0.101562 | -0.007812 | -0.03125 | -0.023438 | -0.03125 | -0.328125 | -0.437500 | -0.468750 | -0.054688 | -0.164062 | -0.398438 | -0.414062 | -0.351562 | -0.289062 | -0.203125 | -0.140625 | -0.023438 | -0.03125 | -0.226562 | -0.351562 | -0.367188 | -0.273438 | -0.289062 | -0.375000 | -0.375000 | -0.367188 | -0.320312 | -0.304688 | -0.195312 
| -0.304688 | -0.171875 | -0.023438 | -0.023438 |
| 14062 | -0.992188 | -0.062500 | -0.289062 | 0.992188 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.031250 | -0.054688 | -0.117188 | -0.171875 | -0.281250 | 0.054688 | 0.937500 | 0.312500 | -0.015625 | -0.046875 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | 0.007812 | 0.007812 | -0.109375 | -0.046875 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | 0.523438 | 0.218750 | -0.03125 | -0.015625 | -0.023438 | -0.03125 | -0.03125 | -0.070312 | -0.234375 | -0.257812 | 0.609375 | -0.015625 | -0.023438 | -0.046875 | -0.054688 | -0.070312 | 0.328125 | -0.156250 | -0.109375 | -0.062500 | -0.046875 | -0.015625 | 0.023438 | 0.054688 | 0.062500 | 0.093750 | 0.062500 | -0.046875 | -0.101562 | -0.078125 | -0.148438 | -0.109375 | -0.031250 | -0.132812 | -0.187500 | -0.125000 | -0.085938 | -0.046875 | 0.000000 | -0.125000 | -0.085938 | -0.523438 | -0.515625 | -0.562500 | -0.835938 | -0.820312 | -0.812500 | -0.796875 | -0.796875 | 0.062500 | -0.062500 | -0.031250 | -0.031250 | 0.007812 | 0.007812 | -0.171875 | 0.273438 | 0.375000 | 0.023438 | 0.0 | 0.906250 | -0.007812 | 0.125000 | -0.015625 | -0.023438 | -0.093750 | 0.015625 | -0.109375 | -0.218750 | -0.039062 | -0.085938 | -0.148438 | -0.179688 | -0.312500 | 0.304688 | 0.687500 | -0.000000 | -0.054688 | -0.031250 | -0.007812 | -0.078125 | -0.031250 | -0.085938 | 0.437500 | 0.093750 | 0.015625 | 0.039062 | -0.046875 | 0.046875 | -0.039062 | 0.039062 | -0.015625 | -0.007812 | 0.406250 | 0.078125 | 0.250000 | -0.195312 | -0.046875 | -0.093750 | -0.039062 | 0.054688 | 0.148438 | 0.101562 | -0.023438 | -0.023438 | -0.03125 | -0.007812 | -0.101562 | -0.007812 | -0.03125 | -0.023438 | -0.03125 | -0.054688 | -0.203125 | -0.101562 | -0.054688 | -0.164062 | 0.367188 | 0.554688 | 0.523438 | 0.195312 | -0.203125 | -0.140625 | -0.023438 | -0.03125 | -0.171875 | 0.054688 | 0.203125 | 0.023438 | 0.085938 | 0.125000 | 0.093750 | 0.015625 | -0.093750 | -0.085938 | -0.101562 | 0.164062 | -0.148438 | -0.023438 | -0.023438 |
| 2287 | 0.992188 | 0.992188 | -0.289062 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.023438 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.906250 | 0.148438 | 0.007812 | -0.023438 | 0.992188 | 0.054688 | -0.023438 | -0.023438 | 0.867188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | -0.007812 | 0.039062 | -0.015625 | 0.992188 | 0.992188 | -0.03125 | -0.015625 | -0.023438 | -0.03125 | -0.03125 | -0.023438 | 0.992188 | 0.992188 | -0.007812 | -0.015625 | 0.710938 | 0.203125 | 0.062500 | -0.101562 | -0.296875 | 0.992188 | 0.992188 | 0.992188 | -0.046875 | -0.015625 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.882812 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.531250 | 0.546875 | 0.882812 | 0.859375 | 0.992188 | 0.992188 | 0.968750 | 0.992188 | -0.460938 | -0.343750 | -0.414062 | -0.515625 | -0.398438 | -0.039062 | 0.039062 | -0.070312 | -0.515625 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | -0.289062 | -0.593750 | 0.992188 | 0.0 | 0.992188 | -0.007812 | -0.234375 | -0.015625 | 0.992188 | 0.992188 | 0.992188 | 0.023438 | 0.992188 | 0.351562 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | -0.007812 | -0.023438 | 0.992188 | -0.085938 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.695312 | -0.000000 | -0.007812 | -0.148438 | -0.093750 | -0.445312 | -0.156250 | -0.046875 | -0.085938 | -0.039062 | -0.226562 | -0.453125 | 0.992188 | 0.992188 | -0.023438 | -0.03125 | -0.007812 | 0.992188 | -0.007812 | -0.03125 | -0.023438 | -0.03125 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | -0.023438 | -0.03125 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | -0.195312 | -0.171875 | -0.023438 | -0.023438 |
| 9207 | -0.992188 | -0.187500 | -0.289062 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.031250 | -0.054688 | -0.117188 | -0.179688 | -0.320312 | -0.085938 | 0.187500 | 0.125000 | -0.015625 | -0.203125 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | -0.187500 | -0.187500 | -0.195312 | -0.187500 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | -0.031250 | -0.109375 | -0.03125 | -0.015625 | -0.023438 | -0.03125 | -0.03125 | -0.062500 | -0.039062 | -0.101562 | -0.210938 | -0.015625 | -0.078125 | -0.109375 | -0.070312 | -0.085938 | 0.070312 | -0.265625 | -0.109375 | -0.062500 | -0.046875 | -0.015625 | -0.203125 | -0.234375 | -0.226562 | -0.210938 | -0.164062 | -0.109375 | 0.179688 | 0.445312 | -0.148438 | -0.109375 | -0.195312 | -0.148438 | -0.195312 | -0.156250 | -0.085938 | -0.203125 | -0.226562 | -0.171875 | -0.187500 | -0.429688 | -0.320312 | -0.117188 | -0.093750 | -0.093750 | -0.085938 | -0.078125 | -0.078125 | 0.914062 | -0.187500 | -0.195312 | -0.195312 | -0.171875 | -0.015625 | -0.171875 | 0.992188 | 0.992188 | -0.171875 | 0.0 | 0.085938 | -0.007812 | -0.109375 | -0.015625 | -0.195312 | -0.093750 | -0.171875 | -0.109375 | -0.218750 | -0.039062 | -0.085938 | -0.148438 | -0.234375 | -0.320312 | -0.031250 | 0.226562 | -0.085938 | -0.078125 | -0.023438 | -0.007812 | -0.007812 | -0.195312 | -0.085938 | -0.179688 | -0.164062 | -0.156250 | -0.140625 | -0.148438 | -0.132812 | -0.179688 | -0.093750 | -0.015625 | -0.007812 | 0.046875 | -0.101562 | -0.078125 | -0.242188 | -0.046875 | -0.046875 | -0.039062 | 0.460938 | -0.140625 | -0.164062 | -0.054688 | -0.023438 | -0.03125 | -0.007812 | -0.101562 | -0.007812 | -0.03125 | -0.023438 | -0.03125 | -0.218750 | 0.109375 | -0.101562 | -0.054688 | -0.164062 | -0.164062 | -0.117188 | -0.351562 | -0.289062 | -0.203125 | -0.140625 | -0.023438 | -0.03125 | -0.195312 | -0.203125 | -0.226562 | -0.156250 | -0.132812 | -0.101562 | -0.125000 | -0.156250 | -0.187500 | -0.171875 | -0.140625 | 0.218750 | 
0.062500 | -0.023438 | -0.023438 |
print("Dimensions del train set: " + str(test_df.shape))
Dimensions del train set: (16000, 171)
# Stack the two frames vertically into one combined dataset, then report
# which columns (if any) still contain null values.
combined_df = pd.concat([train_df, test_df])

# Per-column null totals, filtered down to only the columns with at least one null.
nulls_per_column = combined_df.isnull().sum()
columnesAmbNulls = nulls_per_column[nulls_per_column > 0]

print("Columnes amb algún valor nul:")
print(columnesAmbNulls)
Columnes amb algún valor nul: Series([], dtype: int64)
Com es pot apreciar, el dataset ja ha estat netejat. De fet, no hi ha valors nuls (han estat imputats). Per tant, no cal fer res a nivell de neteja.
combined_df.info()
<class 'pandas.core.frame.DataFrame'> Index: 76000 entries, 0 to 15999 Columns: 171 entries, class to eg_000 dtypes: float64(171) memory usage: 99.7 MB
Observem que tenim 171 columnes i 76.000 entrades. A nivell de columnes haurem de fer algun tipus de reducció a l'hora d'entrenar els algoritmes.
combined_df.describe()
| class | aa_000 | ab_000 | ac_000 | ad_000 | ae_000 | af_000 | ag_000 | ag_001 | ag_002 | ag_003 | ag_004 | ag_005 | ag_006 | ag_007 | ag_008 | ag_009 | ah_000 | ai_000 | aj_000 | ak_000 | al_000 | am_0 | an_000 | ao_000 | ap_000 | aq_000 | ar_000 | as_000 | at_000 | au_000 | av_000 | ax_000 | ay_000 | ay_001 | ay_002 | ay_003 | ay_004 | ay_005 | ay_006 | ay_007 | ay_008 | ay_009 | az_000 | az_001 | az_002 | az_003 | az_004 | az_005 | az_006 | az_007 | az_008 | az_009 | ba_000 | ba_001 | ba_002 | ba_003 | ba_004 | ba_005 | ba_006 | ba_007 | ba_008 | ba_009 | bb_000 | bc_000 | bd_000 | be_000 | bf_000 | bg_000 | bh_000 | bi_000 | bj_000 | bk_000 | bl_000 | bm_000 | bn_000 | bo_000 | bp_000 | bq_000 | br_000 | bs_000 | bt_000 | bu_000 | bv_000 | bx_000 | by_000 | bz_000 | ca_000 | cb_000 | cc_000 | cd_000 | ce_000 | cf_000 | cg_000 | ch_000 | ci_000 | cj_000 | ck_000 | cl_000 | cm_000 | cn_000 | cn_001 | cn_002 | cn_003 | cn_004 | cn_005 | cn_006 | cn_007 | cn_008 | cn_009 | co_000 | cp_000 | cq_000 | cr_000 | cs_000 | cs_001 | cs_002 | cs_003 | cs_004 | cs_005 | cs_006 | cs_007 | cs_008 | cs_009 | ct_000 | cu_000 | cv_000 | cx_000 | cy_000 | cz_000 | da_000 | db_000 | dc_000 | dd_000 | de_000 | df_000 | dg_000 | dh_000 | di_000 | dj_000 | dk_000 | dl_000 | dm_000 | dn_000 | do_000 | dp_000 | dq_000 | dr_000 | ds_000 | dt_000 | du_000 | dv_000 | dx_000 | dy_000 | dz_000 | ea_000 | eb_000 | ec_00 | ed_000 | ee_000 | ee_001 | ee_002 | ee_003 | ee_004 | ee_005 | ee_006 | ee_007 | ee_008 | ee_009 | ef_000 | eg_000 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 
76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 | 76000.000000 |
| mean | -0.956286 | -0.123457 | -0.068506 | -0.204456 | -0.007744 | -0.032901 | -0.039935 | -0.006538 | -0.025861 | -0.040086 | -0.073948 | -0.096590 | -0.114123 | -0.100090 | -0.087194 | -0.053257 | -0.018698 | -0.128756 | -0.035237 | -0.019805 | -0.021407 | -0.061695 | -0.061888 | -0.126925 | -0.126813 | -0.114968 | -0.111576 | -0.063860 | -0.007479 | -0.025440 | -0.014966 | -0.054155 | -0.081160 | -0.028543 | -0.013480 | -0.020724 | -0.026993 | -0.026518 | -0.038354 | -0.081945 | -0.097188 | -0.102304 | -0.013018 | -0.033402 | -0.046559 | -0.027121 | -0.046227 | -0.101922 | -0.106999 | -0.068059 | -0.044009 | -0.035954 | -0.012872 | -0.103941 | -0.108328 | -0.109599 | -0.108458 | -0.105600 | -0.100647 | -0.093668 | -0.102299 | -0.068957 | -0.064875 | -0.126615 | -0.068250 | -0.077465 | -0.061668 | -0.058185 | -0.128755 | -0.120815 | -0.109564 | -0.104628 | -0.166983 | -0.160733 | -0.153714 | -0.151010 | -0.150269 | -0.151227 | -0.148933 | -0.151067 | -0.128062 | -0.123451 | -0.126614 | -0.126614 | -0.119450 | -0.114966 | -0.080539 | -0.121993 | -0.114327 | -0.119143 | -0.000125 | -0.099145 | -0.007769 | -0.069779 | -0.015386 | -0.125790 | -0.064805 | -0.108921 | -0.080057 | -0.091098 | -0.029625 | -0.055514 | -0.086066 | -0.101060 | -0.110041 | -0.102328 | -0.077996 | -0.065892 | -0.058477 | -0.021773 | -0.007714 | -0.038094 | -0.126614 | -0.063388 | -0.108046 | -0.069122 | -0.088596 | -0.098250 | -0.093793 | -0.112382 | -0.094442 | -0.019589 | -0.009793 | -0.007700 | -0.036870 | -0.059547 | -0.136807 | -0.115114 | -0.033793 | -0.039082 | -0.028328 | -0.038559 | -0.137726 | -0.095937 | -0.057752 | -0.019653 | -0.025454 | -0.005641 | -0.064918 | -0.007296 | -0.028855 | -0.021801 | -0.029184 | -0.112852 | -0.094615 | -0.105553 | -0.036685 | -0.094059 | -0.110459 | -0.110635 | -0.107180 | -0.087959 | -0.102316 | -0.073177 | -0.020095 | -0.024698 | -0.095093 | -0.108845 | -0.111245 | -0.097584 | -0.090798 | -0.104035 | -0.098011 | -0.093929 | -0.088263 | -0.102654 
| -0.087985 | -0.084783 | -0.067596 | -0.019900 | -0.018338 |
| std | 0.264488 | 0.368933 | 0.354247 | 0.559990 | 0.003686 | 0.108952 | 0.114069 | 0.032640 | 0.067743 | 0.108607 | 0.189024 | 0.257076 | 0.356217 | 0.383781 | 0.360956 | 0.217399 | 0.066004 | 0.381073 | 0.114209 | 0.047797 | 0.040711 | 0.192398 | 0.191147 | 0.388615 | 0.387402 | 0.315066 | 0.334439 | 0.180781 | 0.017944 | 0.092934 | 0.024368 | 0.242139 | 0.307676 | 0.045847 | 0.041715 | 0.045337 | 0.055067 | 0.056572 | 0.123435 | 0.344880 | 0.319074 | 0.302856 | 0.045139 | 0.162942 | 0.196780 | 0.143283 | 0.198141 | 0.366730 | 0.339008 | 0.180041 | 0.121645 | 0.092093 | 0.041163 | 0.348218 | 0.349829 | 0.328218 | 0.343321 | 0.355520 | 0.359694 | 0.334376 | 0.385969 | 0.235300 | 0.185761 | 0.369529 | 0.235658 | 0.265082 | 0.222901 | 0.214957 | 0.381072 | 0.349708 | 0.321543 | 0.290430 | 0.685287 | 0.713004 | 0.734811 | 0.746764 | 0.750918 | 0.754995 | 0.754578 | 0.757787 | 0.672921 | 0.368991 | 0.369582 | 0.369582 | 0.358962 | 0.367994 | 0.248009 | 0.773618 | 0.785222 | 0.358546 | 0.093746 | 0.448757 | 0.003660 | 0.233482 | 0.015508 | 0.371980 | 0.147374 | 0.323502 | 0.157989 | 0.283576 | 0.085434 | 0.155023 | 0.223568 | 0.289119 | 0.361280 | 0.387668 | 0.341521 | 0.208554 | 0.146235 | 0.079368 | 0.003717 | 0.144504 | 0.369582 | 0.127878 | 0.403157 | 0.289388 | 0.248334 | 0.328049 | 0.248167 | 0.364734 | 0.432127 | 0.227275 | 0.028360 | 0.008929 | 0.200351 | 0.227182 | 0.434096 | 0.282545 | 0.090405 | 0.160865 | 0.094027 | 0.274599 | 0.429119 | 0.325940 | 0.250826 | 0.057126 | 0.064071 | 0.031859 | 0.165466 | 0.019745 | 0.046531 | 0.039596 | 0.042373 | 0.328632 | 0.429458 | 0.462232 | 0.100253 | 0.246342 | 0.384647 | 0.403113 | 0.409089 | 0.345190 | 0.285210 | 0.224485 | 0.051746 | 0.072933 | 0.297537 | 0.355387 | 0.376722 | 0.307001 | 0.306367 | 0.356875 | 0.362312 | 0.363698 | 0.337173 | 0.321224 | 0.239546 | 0.363531 | 0.261067 | 0.052789 | 0.062205 |
| min | -0.992188 | -0.406250 | -0.289062 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.031250 | -0.054688 | -0.117188 | -0.187500 | -0.335938 | -0.421875 | -0.351562 | -0.164062 | -0.031250 | -0.429688 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | -0.437500 | -0.437500 | -0.320312 | -0.351562 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | -0.171875 | -0.257812 | -0.031250 | -0.015625 | -0.023438 | -0.031250 | -0.031250 | -0.078125 | -0.328125 | -0.304688 | -0.265625 | -0.015625 | -0.109375 | -0.132812 | -0.078125 | -0.132812 | -0.351562 | -0.328125 | -0.109375 | -0.062500 | -0.046875 | -0.015625 | -0.367188 | -0.375000 | -0.343750 | -0.367188 | -0.375000 | -0.367188 | -0.328125 | -0.351562 | -0.148438 | -0.109375 | -0.414062 | -0.156250 | -0.195312 | -0.156250 | -0.132812 | -0.429688 | -0.375000 | -0.328125 | -0.281250 | -0.960938 | -0.960938 | -0.882812 | -0.835938 | -0.820312 | -0.812500 | -0.796875 | -0.796875 | -0.945312 | -0.406250 | -0.414062 | -0.414062 | -0.398438 | -0.406250 | -0.171875 | -0.992188 | -0.992188 | -0.398438 | -0.992188 | -0.445312 | -0.007812 | -0.250000 | -0.015625 | -0.414062 | -0.093750 | -0.328125 | -0.109375 | -0.218750 | -0.039062 | -0.085938 | -0.148438 | -0.242188 | -0.382812 | -0.421875 | -0.312500 | -0.156250 | -0.101562 | -0.031250 | -0.007812 | -0.078125 | -0.414062 | -0.085938 | -0.531250 | -0.273438 | -0.195312 | -0.320312 | -0.210938 | -0.398438 | -0.460938 | -0.187500 | -0.023438 | -0.007812 | -0.156250 | -0.179688 | -0.523438 | -0.242188 | -0.046875 | -0.093750 | -0.039062 | -0.226562 | -0.523438 | -0.328125 | -0.234375 | -0.023438 | -0.031250 | -0.007812 | -0.101562 | -0.007812 | -0.031250 | -0.023438 | -0.031250 | -0.343750 | -0.445312 | -0.476562 | -0.054688 | -0.164062 | -0.414062 | -0.437500 | -0.351562 | -0.289062 | -0.203125 | -0.140625 | -0.023438 | -0.031250 | -0.242188 | -0.406250 | -0.429688 | -0.304688 | -0.304688 | -0.382812 | -0.382812 | -0.382812 | -0.351562 | -0.312500 | 
-0.195312 | -0.304688 | -0.171875 | -0.023438 | -0.023438 |
| 25% | -0.992188 | -0.398438 | -0.289062 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.031250 | -0.054688 | -0.117188 | -0.179688 | -0.335938 | -0.421875 | -0.351562 | -0.164062 | -0.031250 | -0.421875 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | -0.429688 | -0.429688 | -0.312500 | -0.343750 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | -0.171875 | -0.250000 | -0.031250 | -0.015625 | -0.023438 | -0.031250 | -0.031250 | -0.078125 | -0.328125 | -0.296875 | -0.257812 | -0.015625 | -0.093750 | -0.125000 | -0.078125 | -0.132812 | -0.351562 | -0.320312 | -0.109375 | -0.062500 | -0.046875 | -0.015625 | -0.359375 | -0.375000 | -0.335938 | -0.359375 | -0.375000 | -0.367188 | -0.328125 | -0.351562 | -0.148438 | -0.109375 | -0.406250 | -0.156250 | -0.195312 | -0.156250 | -0.132812 | -0.421875 | -0.375000 | -0.320312 | -0.273438 | -0.625000 | -0.648438 | -0.882812 | -0.835938 | -0.820312 | -0.812500 | -0.796875 | -0.796875 | -0.742188 | -0.398438 | -0.406250 | -0.406250 | -0.390625 | -0.398438 | -0.171875 | -0.882812 | -0.882812 | -0.390625 | 0.000000 | -0.445312 | -0.007812 | -0.226562 | -0.015625 | -0.406250 | -0.093750 | -0.320312 | -0.109375 | -0.218750 | -0.039062 | -0.085938 | -0.148438 | -0.234375 | -0.375000 | -0.421875 | -0.312500 | -0.156250 | -0.101562 | -0.031250 | -0.007812 | -0.078125 | -0.406250 | -0.085938 | -0.406250 | -0.257812 | -0.195312 | -0.312500 | -0.210938 | -0.390625 | -0.453125 | -0.171875 | -0.023438 | -0.007812 | -0.148438 | -0.164062 | -0.507812 | -0.242188 | -0.046875 | -0.093750 | -0.039062 | -0.226562 | -0.515625 | -0.312500 | -0.195312 | -0.023438 | -0.031250 | -0.007812 | -0.101562 | -0.007812 | -0.031250 | -0.023438 | -0.031250 | -0.335938 | -0.445312 | -0.476562 | -0.054688 | -0.164062 | -0.414062 | -0.437500 | -0.351562 | -0.289062 | -0.203125 | -0.140625 | -0.023438 | -0.031250 | -0.242188 | -0.375000 | -0.406250 | -0.296875 | -0.296875 | -0.382812 | -0.382812 | -0.375000 | -0.343750 | -0.312500 | 
-0.195312 | -0.304688 | -0.171875 | -0.023438 | -0.023438 |
| 50% | -0.992188 | -0.195312 | -0.289062 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.031250 | -0.054688 | -0.117188 | -0.179688 | -0.281250 | -0.187500 | -0.265625 | -0.156250 | -0.031250 | -0.195312 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | -0.195312 | -0.195312 | -0.210938 | -0.210938 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | -0.156250 | -0.210938 | -0.031250 | -0.015625 | -0.023438 | -0.031250 | -0.031250 | -0.078125 | -0.273438 | -0.234375 | -0.234375 | -0.015625 | -0.078125 | -0.109375 | -0.062500 | -0.125000 | -0.328125 | -0.250000 | -0.109375 | -0.062500 | -0.046875 | -0.015625 | -0.187500 | -0.187500 | -0.187500 | -0.187500 | -0.187500 | -0.203125 | -0.218750 | -0.343750 | -0.148438 | -0.109375 | -0.195312 | -0.148438 | -0.187500 | -0.140625 | -0.132812 | -0.195312 | -0.203125 | -0.210938 | -0.195312 | -0.445312 | -0.445312 | -0.414062 | -0.429688 | -0.460938 | -0.484375 | -0.554688 | -0.796875 | -0.343750 | -0.195312 | -0.195312 | -0.195312 | -0.179688 | -0.171875 | -0.171875 | -0.304688 | -0.335938 | -0.179688 | 0.000000 | -0.421875 | -0.007812 | -0.125000 | -0.015625 | -0.195312 | -0.093750 | -0.210938 | -0.109375 | -0.210938 | -0.039062 | -0.085938 | -0.148438 | -0.226562 | -0.226562 | -0.203125 | -0.242188 | -0.132812 | -0.093750 | -0.031250 | -0.007812 | -0.078125 | -0.195312 | -0.085938 | -0.218750 | -0.148438 | -0.179688 | -0.210938 | -0.171875 | -0.179688 | -0.257812 | -0.109375 | -0.015625 | -0.007812 | -0.109375 | -0.140625 | -0.218750 | -0.218750 | -0.046875 | -0.093750 | -0.039062 | -0.226562 | -0.164062 | -0.195312 | -0.140625 | -0.023438 | -0.031250 | -0.007812 | -0.101562 | -0.007812 | -0.031250 | -0.023438 | -0.031250 | -0.195312 | -0.265625 | -0.296875 | -0.054688 | -0.164062 | -0.203125 | -0.210938 | -0.335938 | -0.273438 | -0.203125 | -0.140625 | -0.023438 | -0.031250 | -0.226562 | -0.203125 | -0.203125 | -0.195312 | -0.171875 | -0.179688 | -0.179688 | -0.187500 | -0.179688 | -0.226562 | 
-0.171875 | -0.296875 | -0.171875 | -0.023438 | -0.023438 |
| 75% | -0.992188 | -0.070312 | 0.000000 | -0.468750 | -0.007812 | -0.046875 | -0.054688 | -0.007812 | -0.031250 | -0.054688 | -0.117188 | -0.164062 | -0.054688 | 0.062500 | 0.062500 | -0.039062 | -0.031250 | -0.046875 | -0.054688 | -0.023438 | -0.023438 | -0.109375 | -0.109375 | -0.039062 | -0.046875 | -0.093750 | -0.054688 | -0.101562 | -0.007812 | -0.039062 | -0.015625 | -0.070312 | -0.078125 | -0.031250 | -0.015625 | -0.023438 | -0.031250 | -0.031250 | -0.054688 | 0.054688 | -0.039062 | -0.109375 | -0.015625 | -0.046875 | -0.070312 | -0.046875 | -0.070312 | 0.070312 | -0.054688 | -0.109375 | -0.062500 | -0.046875 | -0.015625 | -0.031250 | -0.031250 | -0.062500 | -0.039062 | -0.015625 | -0.007812 | -0.007812 | 0.039062 | -0.140625 | -0.109375 | -0.062500 | -0.117188 | -0.109375 | -0.085938 | -0.101562 | -0.046875 | -0.054688 | -0.078125 | -0.093750 | 0.242188 | 0.656250 | 0.789062 | 0.835938 | 0.843750 | 0.851562 | 0.851562 | 0.859375 | 0.453125 | -0.070312 | -0.062500 | -0.062500 | -0.039062 | -0.031250 | -0.148438 | 0.750000 | 0.804688 | -0.039062 | 0.000000 | 0.148438 | -0.007812 | -0.015625 | -0.015625 | -0.062500 | -0.093750 | -0.070312 | -0.109375 | -0.148438 | -0.039062 | -0.085938 | -0.140625 | -0.132812 | -0.023438 | 0.054688 | 0.031250 | -0.078125 | -0.078125 | -0.031250 | -0.007812 | -0.070312 | -0.062500 | -0.085938 | 0.023438 | -0.031250 | -0.117188 | -0.054688 | -0.109375 | -0.031250 | 0.117188 | 0.039062 | -0.007812 | -0.007812 | -0.015625 | -0.070312 | 0.062500 | -0.164062 | -0.046875 | -0.070312 | -0.039062 | 0.078125 | 0.023438 | -0.054688 | -0.054688 | -0.023438 | -0.031250 | -0.007812 | -0.101562 | -0.007812 | -0.031250 | -0.023438 | -0.031250 | -0.062500 | 0.125000 | 0.078125 | -0.054688 | -0.164062 | 0.015625 | 0.039062 | -0.046875 | -0.031250 | -0.203125 | -0.140625 | -0.023438 | -0.031250 | -0.140625 | -0.023438 | -0.007812 | -0.062500 | -0.046875 | -0.007812 | 0.015625 | 0.017578 | 0.007812 | -0.054688 | -0.101562 | 0.000000 | -0.132812 
| -0.023438 | -0.023438 |
| max | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 | 0.992188 |
# Round 'class' to the nearest label and map -1 -> 0 so both sets use
# integer {0, 1} targets. (The original chained .replace(-1, 0) twice on
# the train set; the second call was a no-op and has been removed.)
train_df['class'] = train_df['class'].round().replace(-1, 0)
test_df['class'] = test_df['class'].round().replace(-1, 0)
train_df['class'] = train_df['class'].astype(int)
test_df['class'] = test_df['class'].astype(int)
# Bar plot of class frequencies. Derive x from the value_counts index so the
# bar labels and heights stay aligned: `unique()` orders by first appearance
# while `value_counts()` orders by descending count, so pairing them can
# attach the wrong height to a label.
class_counts = combined_df['class'].value_counts()
sns.barplot(x=class_counts.index, y=class_counts.values)
<Axes: ylabel='count'>
# Count positive (1) and negative (0) samples and report the class imbalance.
pos = combined_df['class'].round().value_counts().get(1, 0)
negatives = combined_df['class'].round().replace(-1, 0).value_counts()
neg = negatives.get(0, 0)
perc = pos / neg * 100
print("Núm valors '1':", pos, ", Núm valors '0':", neg, " Perc. positius:", round(perc, 2), "%")
Núm valors '1': 1375 , Núm valors '0': 74625 Perc. positius: 1.84 %
El dataset està molt poc balancejat, ja que només l'1.84% dels registres són positius.
# Rank features by absolute correlation with the target and draw a lower-
# triangle heatmap of the top entries (index 0 is 'class' itself).
correlation_matrix = combined_df.corr()
class_correlation = correlation_matrix['class']
sorted_correlation = class_correlation.abs().sort_values(ascending=False)
top_correlated_variables = list(sorted_correlation.index[:11])
filtered_correlation_matrix = correlation_matrix.loc[top_correlated_variables, top_correlated_variables]
sns.set(style="white")
# Mask the upper triangle: the matrix is symmetric, so it adds no information.
mask = np.triu(np.ones_like(filtered_correlation_matrix, dtype=bool))
plt.figure(figsize=(6, 6))
sns.heatmap(filtered_correlation_matrix, mask=mask, annot=True, cmap="coolwarm", fmt=".2f")
plt.show()
Observem que les variables més correlacionades amb la variable objectiu tenen un nivell de correlació d'entre 0.43 i 0.51 (sent ag_002 la més correlacionada).
D'altra banda, veiem que algunes variables també estan molt correlacionades entre elles, com (ap_000 i bj_000) o (az_001 i az_002). Això podria indicar que podríem quedar-nos només amb una variable de cada parella.
# Pairwise scatter plots of the top-correlated variables.
sns.pairplot(data=combined_df, vars=top_correlated_variables)
C:\Users\Ramon\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
<seaborn.axisgrid.PairGrid at 0x25bbc1ae790>
Podem observar la correlació entre les diferents classes. Sent els plots que tenen una forma de "diagonalització" les variables amb més correlació. Podem observar també que les quatre variables més correlacionades amb la variable objectiu solen estar molt a prop del 0 quan la variable objectiu és negativa i solen estar a prop de l'1 quan la variable objectiu és positiva.
# Compare the magnitude of ag_002 between positive and negative samples.
rows_p = combined_df[combined_df['class'] > 0]
rows_n = combined_df[combined_df['class'] < 0]

# Positive cases: mean/median/std of |ag_002|.
abs_p = rows_p['ag_002'].abs()
print("Casos positius")
print(f"Average (absolute): {abs_p.mean()}")
print(f"Median: {abs_p.median()}")
print(f"Standard Deviation: {abs_p.std()}\n\n")

# Negative cases: same statistics.
abs_n = rows_n['ag_002'].abs()
print("Casos negatius")
print(f"Average (absolute): {abs_n.mean()}")
print(f"Median: {abs_n.median()}")
print(f"Standard Deviation: {abs_n.std()}")
Casos positius Average (absolute): 0.4271193181818182 Median: 0.0546875 Standard Deviation: 0.4413658887175721 Casos negatius Average (absolute): 0.05892033082077052 Median: 0.0546875 Standard Deviation: 0.0568288713371542
# Histograms of four highly-ranked features, positives (red, left column)
# vs negatives (green, right column). The original repeated the same nine
# lines per feature; a data-driven loop produces the identical figure.
hist_features = ['ag_002', 'az_000', 'az_002', 'cn_000']
fig, axs = plt.subplots(4, 2, figsize=(12, 16))
for row, feature in enumerate(hist_features):
    axs[row][0].hist(rows_p[feature], bins=50, color='red', alpha=0.7)
    axs[row][0].set_title(f'Distribució dels positius ({feature})')
    axs[row][0].set_xlabel('Positius')
    axs[row][0].set_ylabel('Freqüència')
    axs[row][1].hist(rows_n[feature], bins=50, color='green', alpha=0.7)
    axs[row][1].set_title(f'Distribució dels negatius ({feature})')
    axs[row][1].set_xlabel('Negatius')
    axs[row][1].set_ylabel('Freqüència')
plt.tight_layout()
plt.show()
# Separate the target column from the features, and rebuild the combined
# feature matrix / target vector used for cross-validation.
y_train = train_df['class'].copy()
X_train = train_df.drop(columns=['class']).copy()
y_test = test_df['class'].copy()
X_test = test_df.drop(columns=['class']).copy()
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])
# Pipeline 1 (training split only -> X_resampled / y_resampled):
# undersample the majority class to a 1:5 minority:majority ratio.
rus = RandomUnderSampler(sampling_strategy=0.2, random_state=seed_value)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
# SMOTE was tried as the oversampler but is disabled in favour of RandomOverSampler.
#smote = SMOTE(sampling_strategy='auto', random_state=seed_value)
#X_resampled, y_resampled = smote.fit_resample(X_resampled, y_resampled)
ros = RandomOverSampler(sampling_strategy='auto', random_state=seed_value)
X_resampled, y_resampled = ros.fit_resample(X_resampled, y_resampled)
# Pipeline 2 (full dataset -> X_res / y_res): same under- then over-sampling.
rus = RandomUnderSampler(sampling_strategy=0.2, random_state=seed_value)
X_res, y_res = rus.fit_resample(X, y)
# SMOTE alternative, also disabled here.
#smote = SMOTE(sampling_strategy='auto', random_state=seed_value)
#X_resampled, y_resampled = smote.fit_resample(X_resampled, y_resampled)
ros = RandomOverSampler(sampling_strategy='auto', random_state=seed_value)
X_res, y_res = ros.fit_resample(X_res, y_res)
# Class counts after pipeline 1 (training split).
print("Nombre de mostres positives:", sum(y_resampled == 1))
print("Nombre de mostres negatives:", sum(y_resampled == 0))
Nombre de mostres positives: 5000 Nombre de mostres negatives: 5000
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, balanced_accuracy_score, f1_score, precision_score, accuracy_score
# Helper: print the confusion matrix plus precision, accuracy, recall and F1.
def calcularQualitatResultats(y_test, y_pred):
    """Print quality metrics for a binary prediction.

    The confusion matrix is flipped on both axes so the positive class
    appears in the top-left corner.
    """
    confusion_mat = confusion_matrix(y_test, y_pred)
    print(confusion_mat[::-1, ::-1])
    # micro-F1 restricted to label 1 equals the binary F1 of the positive class.
    metrics = (
        ('Precision', precision_score(y_test, y_pred)),
        ('Accuracy', accuracy_score(y_test, y_pred)),
        ('Recall', recall_score(y_test, y_pred)),
        ('F1', f1_score(y_test, y_pred, labels=[1], average='micro')),
    )
    for nom, valor in metrics:
        print(f'{nom}: {round(valor, 3)}')
from sklearn.model_selection import cross_val_score

def calcularValidacioCreuada(model, x, y):
    """Run 5-fold cross-validation and print recall and F1 per fold plus means."""
    for metrica in ('recall', 'f1'):
        scores = cross_val_score(model, x, y, cv=5, scoring=metrica)
        print(f"Scores de validació creuada ({metrica}):", scores)
        print(f"Mitjana dels scores ({metrica}):", scores.mean())
# Drop 'class' itself (position 0) so only real features remain.
top_correlated_variables = top_correlated_variables[1:11]
from sklearn.model_selection import GridSearchCV
# Hyper-parameter search for the random forest on the full (imbalanced) data.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
}
rf = RandomForestClassifier(random_state=seed_value)
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(X[top_correlated_variables], y)
print("Millors paràmetres trobats a través de Grid Search:")
print(grid_search.best_params_)
Millors paràmetres trobats a través de Grid Search:
{'max_depth': 20, 'n_estimators': 100}
# Repeat the grid search on the resampled (balanced) dataset.
rf_balanced = RandomForestClassifier(random_state=seed_value)
grid_search = GridSearchCV(rf_balanced, param_grid, cv=5)
grid_search.fit(X_res[top_correlated_variables], y_res)
print("Millors paràmetres trobats a través de Grid Search:")
print(grid_search.best_params_)
Millors paràmetres trobats a través de Grid Search:
{'max_depth': None, 'n_estimators': 100}
# Random forest on the most correlated variables, with class weights
# matching the prevalence of each class (98% negative / 2% positive).
model_rfc = RandomForestClassifier(random_state=seed_value,
                                   class_weight={0: 0.98, 1: 0.02})
calcularValidacioCreuada(model_rfc, X[top_correlated_variables], y)
Scores de validació creuada (recall): [0.49454545 0.46545455 0.42545455 0.49818182 0.52 ] Mitjana dels scores (recall): 0.4807272727272728 Scores de validació creuada (f1): [0.60176991 0.5577342 0.54418605 0.59956236 0.62582057] Mitjana dels scores (f1): 0.5858146189950766
# Same model restricted to max_depth=20 (the best depth found by the first
# grid search).
model_rfc = RandomForestClassifier(random_state=seed_value,
                                   class_weight={0: 0.98, 1: 0.02},
                                   max_depth=20)
calcularValidacioCreuada(model_rfc, X[top_correlated_variables], y)
Scores de validació creuada (recall): [0.42909091 0.37090909 0.35636364 0.39636364 0.45818182] Mitjana dels scores (recall): 0.40218181818181814 Scores de validació creuada (f1): [0.55924171 0.49514563 0.49122807 0.52403846 0.59294118] Mitjana dels scores (f1): 0.5325190090827173
# Random forest evaluated on the resampled (balanced) dataset.
model_rfc_resampling = RandomForestClassifier(random_state=seed_value)
calcularValidacioCreuada(model_rfc_resampling, X_res[top_correlated_variables], y_res)
Scores de validació creuada (recall): [0.99418182 0.99781818 0.99272727 0.99636364 0.99563636] Mitjana dels scores (recall): 0.9953454545454546 Scores de validació creuada (f1): [0.98098314 0.98175313 0.97360913 0.97997139 0.982771 ] Mitjana dels scores (f1): 0.9798175563872986
Podem observar una clara superioritat quan entrenem el model amb resampling.
# Plot a single decision tree for a visual sample of the learned rules.
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

model_dt = DecisionTreeClassifier(random_state=seed_value)
model_dt.fit(X[top_correlated_variables], y)
fig = plt.figure(figsize=(12, 8))
plot_tree(model_dt, filled=True, feature_names=top_correlated_variables,
          class_names=["Negatiu", "Positiu"], rounded=True)
plt.show()
# Same visual decision tree, trained on the resampled (balanced) data.
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

model_dt_resampling = DecisionTreeClassifier(random_state=seed_value)
model_dt_resampling.fit(X_res[top_correlated_variables], y_res)
fig = plt.figure(figsize=(12, 8))
plot_tree(model_dt_resampling, filled=True, feature_names=top_correlated_variables,
          class_names=["Negatiu", "Positiu"], rounded=True)
plt.show()
Fins i tot, al fer una representació gràfica del model, observem que amb resampling obtenim un model més clar en el que només amb la primera condició ja podríem encertar amb bastant bona fiabilitat la classe objectiu.
# Rank all features by random-forest importance on the full dataset and
# show the top 20.
tmp = RandomForestClassifier(random_state=seed_value)
tmp.fit(X, y)
feature_ranking = sorted(zip(tmp.feature_importances_, X_train.columns), reverse=True)
for importance, feature in feature_ranking[:20]:
    print(f'{feature}: {importance}')
aq_000: 0.03859702646103143 ag_002: 0.038109633323867007 ag_001: 0.03681617357229708 ap_000: 0.03152896997784845 bj_000: 0.026646179157282082 dn_000: 0.026132009097502244 cn_000: 0.023751866925356125 ck_000: 0.02020124709693456 bh_000: 0.019129322753020418 ay_005: 0.017189392835597176 ai_000: 0.015381877476428978 az_002: 0.014815515057561863 ay_006: 0.013731624733032657 az_000: 0.013580614631730034 bk_000: 0.013115031158206412 az_001: 0.012420981375734797 aa_000: 0.012033814175385696 cs_002: 0.012033651116092997 am_0: 0.011548522552351361 bb_000: 0.010681172815911157
# Same importance ranking computed on the resampled (balanced) dataset.
tmp = RandomForestClassifier(random_state=seed_value)
tmp.fit(X_res, y_res)
feature_ranking_sampling = sorted(zip(tmp.feature_importances_, X_resampled.columns), reverse=True)
for importance, feature in feature_ranking_sampling[:20]:
    print(f'{feature}: {importance}')
ap_000: 0.08458694779793273 dn_000: 0.07467220850740514 bt_000: 0.06503089099465721 aa_000: 0.060021924069825074 bb_000: 0.04808749844273397 ci_000: 0.04334849542532763 ck_000: 0.043266593945758026 bv_000: 0.03658317213071928 bj_000: 0.03526801400239967 bh_000: 0.0344362316041101 ah_000: 0.03219006177139122 bu_000: 0.03215038108719533 aq_000: 0.02490782481520959 bx_000: 0.022013141177277748 bi_000: 0.021951071384568796 az_000: 0.017016968591169076 bg_000: 0.015944205105030827 ba_002: 0.01557171127840761 cs_002: 0.014912900287000392 cq_000: 0.011393214548399217
# Keep only the names of the ten most important features from each ranking.
top_10_features = [feat for _, feat in feature_ranking[:10]]
top_10_features_sampling = [feat for _, feat in feature_ranking_sampling[:10]]
# Find a good number of estimators by tracking test-set F1 as n_estimators
# grows. (The original also built an `error_rate` OrderedDict that was never
# used, and carried stale copy-paste comments; both removed.)
from sklearn.metrics import f1_score
from collections import OrderedDict
from sklearn.ensemble import RandomForestClassifier
ensemble_clfs = [
    ("RandomForestClassifier, max_features=None",
     RandomForestClassifier(warm_start=True, max_features=None, oob_score=True)),
]
estimators = [10, 20, 30, 50, 100, 300, 500]
f1_scores = {}
for label, clf in ensemble_clfs:
    f1_scores[label] = []
    for n in estimators:
        # warm_start=True keeps the already-fitted trees and only adds new
        # ones, so each refit is incremental.
        clf.set_params(n_estimators=n)
        clf.fit(X_train[top_10_features], y_train)
        y_pred = clf.predict(X_test[top_10_features])
        # micro-F1 restricted to label 1 == binary F1 for the positive class.
        f1 = f1_score(y_test, y_pred, labels=[1], average='micro')
        f1_scores[label].append((n, f1))
for label, pairs in f1_scores.items():
    xs, ys = zip(*pairs)
    plt.plot(xs, ys, label=label)
plt.xlim(0, 500)
plt.xlabel("n_estimators")
plt.ylabel("F1 Score")
plt.legend(loc="upper right")
plt.show()
C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:578: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates. warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\ensemble\_forest.py:578: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable OOB estimates. warn(
Observem que a partir de 50 podria ser un nombre força òptim d'estimadors. Tenint en compte que al no especificar els estimadors agafa 100 estimadors per defecte, ja ens trobaríem en el punt òptim i no cal especificar-li.
# Random forest on the 10 most important features (no resampling),
# weighting classes by prevalence.
model_rfc2 = RandomForestClassifier(random_state=seed_value,
                                    class_weight={0: 0.98, 1: 0.02})
calcularValidacioCreuada(model_rfc2, X[top_10_features], y)
Scores de validació creuada (recall): [0.41090909 0.41090909 0.39636364 0.44 0.43272727] Mitjana dels scores (recall): 0.41818181818181815 Scores de validació creuada (f1): [0.5368171 0.53809524 0.52278177 0.56018519 0.55868545] Mitjana dels scores (f1): 0.5433129492015831
# Random forest on the 10 most important features of the resampled data.
model_rfc2_resampling = RandomForestClassifier(random_state=seed_value)
calcularValidacioCreuada(model_rfc2_resampling, X_res[top_10_features_sampling], y_res)
Scores de validació creuada (recall): [0.99490909 0.99854545 0.99781818 0.99636364 0.99781818] Mitjana dels scores (recall): 0.997090909090909 Scores de validació creuada (f1): [0.98134864 0.97896613 0.97512438 0.97787295 0.98175313] Mitjana dels scores (f1): 0.9790130451034489
# PCA: project the feature matrices onto 10 principal components.
from sklearn.decomposition import PCA
pca = PCA(n_components=10, random_state=seed_value)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)  # projected with the X_train fit
# Use a SEPARATE instance for the combined data: the original re-fitted the
# same `pca` object on X, silently invalidating the fit behind X_test_pca.
# X_pca values are identical either way; nothing downstream reuses `pca`.
pca_full = PCA(n_components=10, random_state=seed_value)
X_pca = pca_full.fit_transform(X)
from sklearn.decomposition import PCA
# Independent PCA fits for each resampled dataset. Separate instances make
# explicit that the two projections are unrelated (the original reused one
# object and re-fitted it, discarding the first fit).
pca_resampled = PCA(n_components=10, random_state=seed_value)
X_resampled_pca = pca_resampled.fit_transform(X_resampled)
pca_res = PCA(n_components=10, random_state=seed_value)
X_res_pca = pca_res.fit_transform(X_res)
# Random forest on the PCA projection of the full dataset.
model_rfc3 = RandomForestClassifier(random_state=seed_value)
calcularValidacioCreuada(model_rfc3, X_pca, y)
Scores de validació creuada (recall): [0.33090909 0.31636364 0.31272727 0.34909091 0.42181818] Mitjana dels scores (recall): 0.3461818181818182 Scores de validació creuada (f1): [0.45614035 0.45430809 0.43877551 0.47058824 0.50877193] Mitjana dels scores (f1): 0.4657168240389463
# Random forest on the PCA projection of the resampled dataset.
model_rfc3_resampling = RandomForestClassifier(random_state=seed_value)
calcularValidacioCreuada(model_rfc3_resampling, X_res_pca, y_res)
Scores de validació creuada (recall): [0.99854545 1. 1. 1. 1. ] Mitjana dels scores (recall): 0.9997090909090909 Scores de validació creuada (f1): [0.97896613 0.97621583 0.97552324 0.97795164 0.98039216] Mitjana dels scores (f1): 0.9778097984504148
En els casos que utilitzem PCA veiem que la qualitat del model acaba sent una mica inferior a la qualitat obtinguda utilitzant les 10 característiques més importants.
# Gradient-boosted trees (XGBoost) on the original, imbalanced data.
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

xgb_classifier = XGBClassifier(n_estimators=100,
                               objective="binary:logistic",
                               seed=seed_value)
calcularValidacioCreuada(xgb_classifier, X, y)
Scores de validació creuada (recall): [0.69454545 0.69090909 0.78909091 0.76727273 0.77454545] Mitjana dels scores (recall): 0.7432727272727273 Scores de validació creuada (f1): [0.77800407 0.77393075 0.8460039 0.82101167 0.80834915] Mitjana dels scores (f1): 0.8054599089562393
# The same XGBoost configuration evaluated on the resampled data.
xgb_classifier = XGBClassifier(n_estimators=100,
                               objective="binary:logistic",
                               seed=seed_value)
calcularValidacioCreuada(xgb_classifier, X_res, y_res)
Scores de validació creuada (recall): [0.99709091 1. 1. 1. 1. ] Mitjana dels scores (recall): 0.9994181818181819 Scores de validació creuada (f1): [0.98597627 0.98743268 0.98284489 0.98920863 0.99170573] Mitjana dels scores (f1): 0.9874336397473786
# Gaussian Naive Bayes on the most correlated variables.
from sklearn.naive_bayes import GaussianNB

model_nb = GaussianNB()
calcularValidacioCreuada(model_nb, X[top_correlated_variables], y)
Scores de validació creuada (recall): [0.85454545 0.82909091 0.88727273 0.84727273 0.89818182] Mitjana dels scores (recall): 0.8632727272727273 Scores de validació creuada (f1): [0.38461538 0.39790576 0.42324371 0.40985048 0.44107143] Mitjana dels scores (f1): 0.4113373536267472
# Gaussian Naive Bayes on the resampled data. The original called
# model_nb_resampling.fit(X_resampled, y_resampled) here, but
# cross_val_score clones the estimator, so that fit was discarded — and
# every other cell keeps the fit commented out. The stray call is removed.
model_nb_resampling = GaussianNB()
calcularValidacioCreuada(model_nb_resampling, X_res, y_res)
Scores de validació creuada (recall): [0.94254545 0.96363636 0.93527273 0.94181818 0.93890909] Mitjana dels scores (recall): 0.9444363636363636 Scores de validació creuada (f1): [0.93270961 0.93905032 0.92484718 0.92998205 0.93618564] Mitjana dels scores (f1): 0.9325549584835452
Amb Naive Bayes no hem obtingut tan bons resultats com anteriorment. Cal destacar, que sense resampling obtenim un f1 molt baix, malgrat que el recall és alt. Això és perquè hi ha hagut molts falsos positius i, per tant, una baixa precisió.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Logistic regression on the original (imbalanced) data, using only the most
# correlated features. (Commented-out hold-out evaluation removed.)
model_lr = LogisticRegression()
calcularValidacioCreuada(model_lr, X[top_correlated_variables], y)
Scores de validació creuada (recall): [0.34909091 0.29454545 0.31636364 0.31272727 0.33818182] Mitjana dels scores (recall): 0.3221818181818182 Scores de validació creuada (f1): [0.45933014 0.41968912 0.44615385 0.45263158 0.47328244] Mitjana dels scores (f1): 0.4502174261121922
# Logistic regression evaluated on the resampled (balanced) data.
model_lr_resampling = LogisticRegression()
calcularValidacioCreuada(model_lr_resampling, X_res[top_correlated_variables], y_res)
Scores de validació creuada (recall): [0.93527273 0.94909091 0.92654545 0.93309091 0.936 ] Mitjana dels scores (recall): 0.9360000000000002 Scores de validació creuada (f1): [0.94385321 0.94496741 0.93230882 0.94130594 0.94424065] Mitjana dels scores (f1): 0.9413352064964761
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.metrics")
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# precision_score / recall_score / f1_score were used below but not imported
# in this cell — make the dependency explicit.
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

# First, find the best SVC hyperparameters with an exhaustive grid search.
svm = SVC(random_state=seed_value)
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10],
    'degree': [2, 3, 4],                 # only used by the polynomial kernel
    'gamma': ['scale', 'auto', 0.1, 1]   # only used by rbf, poly and sigmoid
}
scoring = {
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}
print("GridSearchCV")
# refit='f1': after the search, refit the F1-best model on the whole subset.
grid_search = GridSearchCV(svm, param_grid, scoring=scoring, refit='f1', cv=5)
print("fit")
# Only 3000 rows so the grid search finishes in reasonable time.
grid_search.fit(X_train[top_correlated_variables][:3000], y_train[:3000])
best_svm_model = grid_search.best_estimator_
print("Millors hiperparàmetres:")
print(grid_search.best_params_)
GridSearchCV
fit
Millors hiperparàmetres:
{'C': 1, 'degree': 4, 'gamma': 1, 'kernel': 'poly'}
# SVC with the best hyperparameters found by the grid search, evaluated on
# the original (imbalanced) data. (Commented-out hold-out evaluation removed.)
svm_model = SVC(C=1, degree=4, gamma=1, kernel='poly', random_state=seed_value)
calcularValidacioCreuada(svm_model, X[top_correlated_variables], y)
Scores de validació creuada (recall): [0.36363636 0.30909091 0.30909091 0.28363636 0.36727273] Mitjana dels scores (recall): 0.3265454545454545 Scores de validació creuada (f1): [0.50505051 0.44270833 0.45092838 0.42276423 0.51530612] Mitjana dels scores (f1): 0.4673515140875918
# Same tuned SVC, evaluated on the resampled (balanced) data.
svm_model_resampling = SVC(C=1, degree=4, gamma=1, kernel='poly', random_state=seed_value)
calcularValidacioCreuada(svm_model_resampling, X_res[top_correlated_variables], y_res)
Scores de validació creuada (recall): [0.93163636 0.94327273 0.91781818 0.936 0.928 ] Mitjana dels scores (recall): 0.9313454545454546 Scores de validació creuada (f1): [0.94888889 0.94809942 0.93377728 0.94493392 0.94518519] Mitjana dels scores (f1): 0.9441769388964607
import tensorflow as tf
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from tensorflow.keras import backend as K
def f1_metric(y_true, y_pred):
    """Batch-level F1 score as a Keras metric.

    Predictions are rounded to {0, 1}; precision and recall are computed
    over the whole batch and combined into F1. K.epsilon() guards every
    division against a zero denominator.
    """
    preds = K.round(y_pred)
    true_positives = K.sum(y_true * preds)
    precision = true_positives / (K.sum(preds) + K.epsilon())
    recall = true_positives / (K.sum(y_true) + K.epsilon())
    return 2 * precision * recall / (precision + recall + K.epsilon())
# Feed-forward binary classifier: two hidden ReLU layers, sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# Compile the model: binary cross-entropy, tracking recall and the custom F1.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.Recall(), f1_metric])
# Train the model on the original (imbalanced) training data.
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))
# Make predictions on the test set and binarise at the 0.5 threshold.
y_pred_prob = model.predict(X_test)
y_pred = [1 if prob >= 0.5 else 0 for prob in y_pred_prob]
# Evaluate the quality of the results using the shared helper.
calcularQualitatResultats(y_test, y_pred)
WARNING:tensorflow:From C:\Users\Ramon\anaconda3\Lib\site-packages\keras\src\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead. WARNING:tensorflow:From C:\Users\Ramon\anaconda3\Lib\site-packages\keras\src\optimizers\__init__.py:309: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead. Epoch 1/20 WARNING:tensorflow:From C:\Users\Ramon\anaconda3\Lib\site-packages\keras\src\utils\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead. WARNING:tensorflow:From C:\Users\Ramon\anaconda3\Lib\site-packages\keras\src\engine\base_layer_utils.py:384: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead. 1875/1875 [==============================] - 4s 2ms/step - loss: 0.0347 - recall: 0.5150 - f1_metric: 0.2146 - val_loss: 0.0302 - val_recall: 0.5627 - val_f1_metric: 0.3247 Epoch 2/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0244 - recall: 0.6520 - f1_metric: 0.2748 - val_loss: 0.0273 - val_recall: 0.6773 - val_f1_metric: 0.3852 Epoch 3/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0220 - recall: 0.6830 - f1_metric: 0.2824 - val_loss: 0.0275 - val_recall: 0.6347 - val_f1_metric: 0.3625 Epoch 4/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0208 - recall: 0.6910 - f1_metric: 0.2863 - val_loss: 0.0285 - val_recall: 0.6160 - val_f1_metric: 0.3456 Epoch 5/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0195 - recall: 0.7160 - f1_metric: 0.2965 - val_loss: 0.0280 - val_recall: 0.5947 - val_f1_metric: 0.3383 Epoch 6/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0188 - recall: 0.7250 - f1_metric: 0.3052 - val_loss: 0.0272 - val_recall: 0.7547 - val_f1_metric: 0.4103 Epoch 7/20 1875/1875 [==============================] - 3s 
1ms/step - loss: 0.0177 - recall: 0.7420 - f1_metric: 0.3110 - val_loss: 0.0335 - val_recall: 0.5680 - val_f1_metric: 0.3249 Epoch 8/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0164 - recall: 0.7620 - f1_metric: 0.3136 - val_loss: 0.0283 - val_recall: 0.6880 - val_f1_metric: 0.3830 Epoch 9/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0155 - recall: 0.7740 - f1_metric: 0.3297 - val_loss: 0.0314 - val_recall: 0.6507 - val_f1_metric: 0.3647 Epoch 10/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0146 - recall: 0.7930 - f1_metric: 0.3327 - val_loss: 0.0324 - val_recall: 0.6773 - val_f1_metric: 0.3756 Epoch 11/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0139 - recall: 0.8030 - f1_metric: 0.3277 - val_loss: 0.0354 - val_recall: 0.6160 - val_f1_metric: 0.3499 Epoch 12/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0126 - recall: 0.8130 - f1_metric: 0.3306 - val_loss: 0.0350 - val_recall: 0.6453 - val_f1_metric: 0.3605 Epoch 13/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0123 - recall: 0.8310 - f1_metric: 0.3447 - val_loss: 0.0364 - val_recall: 0.6960 - val_f1_metric: 0.3823 Epoch 14/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0117 - recall: 0.8170 - f1_metric: 0.3441 - val_loss: 0.0378 - val_recall: 0.6267 - val_f1_metric: 0.3517 Epoch 15/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0112 - recall: 0.8300 - f1_metric: 0.3392 - val_loss: 0.0365 - val_recall: 0.7360 - val_f1_metric: 0.3979 Epoch 16/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0102 - recall: 0.8430 - f1_metric: 0.3554 - val_loss: 0.0393 - val_recall: 0.6880 - val_f1_metric: 0.3825 Epoch 17/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0102 - recall: 0.8490 - f1_metric: 0.3622 - val_loss: 0.0393 - val_recall: 0.7333 - val_f1_metric: 0.3954 Epoch 18/20 1875/1875 
[==============================] - 3s 1ms/step - loss: 0.0096 - recall: 0.8610 - f1_metric: 0.3534 - val_loss: 0.0419 - val_recall: 0.7173 - val_f1_metric: 0.3888 Epoch 19/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0094 - recall: 0.8690 - f1_metric: 0.3609 - val_loss: 0.0420 - val_recall: 0.6853 - val_f1_metric: 0.3773 Epoch 20/20 1875/1875 [==============================] - 3s 1ms/step - loss: 0.0089 - recall: 0.8700 - f1_metric: 0.3536 - val_loss: 0.0413 - val_recall: 0.8027 - val_f1_metric: 0.4238 500/500 [==============================] - 1s 909us/step [[ 301 74] [ 114 15511]] Precision: 0.725 Accuracy: 0.988 Recall: 0.803 F1: 0.762
import tensorflow as tf
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from tensorflow.keras import backend as K
def f1_metric(y_true, y_pred):
    """Batch-level F1 score for Keras training logs.

    Rounds the sigmoid outputs to hard {0, 1} labels, derives precision and
    recall from the batch, and returns their harmonic mean. Every division
    is stabilised with K.epsilon().
    """
    hard_preds = K.round(y_pred)
    tp = K.sum(y_true * hard_preds)
    precision = tp / (K.sum(hard_preds) + K.epsilon())
    recall = tp / (K.sum(y_true) + K.epsilon())
    return 2 * (precision * recall) / (precision + recall + K.epsilon())
# Same architecture as before: two hidden ReLU layers, sigmoid output.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# Compile the model: binary cross-entropy, tracking recall and the custom F1.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[tf.keras.metrics.Recall(), f1_metric])
# Train the model on the resampled (balanced) data; validation still uses
# the untouched, imbalanced test split.
model.fit(X_resampled, y_resampled, epochs=20, batch_size=32, validation_data=(X_test, y_test))
# Make predictions on the test set and binarise at the 0.5 threshold.
y_pred_prob = model.predict(X_test)
y_pred = [1 if prob >= 0.5 else 0 for prob in y_pred_prob]
# Evaluate the quality of the results using the shared helper.
calcularQualitatResultats(y_test, y_pred)
WARNING:tensorflow:From C:\Users\Ramon\anaconda3\Lib\site-packages\keras\src\losses.py:2976: The name tf.losses.sparse_softmax_cross_entropy is deprecated. Please use tf.compat.v1.losses.sparse_softmax_cross_entropy instead. WARNING:tensorflow:From C:\Users\Ramon\anaconda3\Lib\site-packages\keras\src\backend.py:873: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead. WARNING:tensorflow:From C:\Users\Ramon\anaconda3\Lib\site-packages\keras\src\optimizers\__init__.py:309: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead. Epoch 1/20 WARNING:tensorflow:From C:\Users\Ramon\anaconda3\Lib\site-packages\keras\src\utils\tf_utils.py:492: The name tf.ragged.RaggedTensorValue is deprecated. Please use tf.compat.v1.ragged.RaggedTensorValue instead. WARNING:tensorflow:From C:\Users\Ramon\anaconda3\Lib\site-packages\keras\src\engine\base_layer_utils.py:384: The name tf.executing_eagerly_outside_functions is deprecated. Please use tf.compat.v1.executing_eagerly_outside_functions instead. 
313/313 [==============================] - 2s 4ms/step - loss: 0.1734 - recall: 0.9580 - f1_metric: 0.9442 - val_loss: 0.1179 - val_recall: 0.9733 - val_f1_metric: 0.3632 Epoch 2/20 313/313 [==============================] - 1s 3ms/step - loss: 0.1059 - recall: 0.9754 - f1_metric: 0.9674 - val_loss: 0.1317 - val_recall: 0.9840 - val_f1_metric: 0.3772 Epoch 3/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0904 - recall: 0.9800 - f1_metric: 0.9736 - val_loss: 0.1152 - val_recall: 0.9787 - val_f1_metric: 0.3921 Epoch 4/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0767 - recall: 0.9818 - f1_metric: 0.9780 - val_loss: 0.0979 - val_recall: 0.9653 - val_f1_metric: 0.4058 Epoch 5/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0668 - recall: 0.9848 - f1_metric: 0.9807 - val_loss: 0.1165 - val_recall: 0.9680 - val_f1_metric: 0.3923 Epoch 6/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0604 - recall: 0.9858 - f1_metric: 0.9825 - val_loss: 0.0753 - val_recall: 0.9360 - val_f1_metric: 0.4211 Epoch 7/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0568 - recall: 0.9870 - f1_metric: 0.9839 - val_loss: 0.0669 - val_recall: 0.9200 - val_f1_metric: 0.4210 Epoch 8/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0449 - recall: 0.9902 - f1_metric: 0.9884 - val_loss: 0.0661 - val_recall: 0.9093 - val_f1_metric: 0.4222 Epoch 9/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0418 - recall: 0.9900 - f1_metric: 0.9887 - val_loss: 0.1172 - val_recall: 0.9467 - val_f1_metric: 0.4030 Epoch 10/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0380 - recall: 0.9896 - f1_metric: 0.9897 - val_loss: 0.1209 - val_recall: 0.9493 - val_f1_metric: 0.3943 Epoch 11/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0329 - recall: 0.9912 - f1_metric: 0.9917 - val_loss: 0.1479 - val_recall: 0.9627 - val_f1_metric: 0.3951 Epoch 
12/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0300 - recall: 0.9904 - f1_metric: 0.9913 - val_loss: 0.1436 - val_recall: 0.9387 - val_f1_metric: 0.3944 Epoch 13/20 313/313 [==============================] - 1s 4ms/step - loss: 0.0269 - recall: 0.9914 - f1_metric: 0.9923 - val_loss: 0.1675 - val_recall: 0.9573 - val_f1_metric: 0.3926 Epoch 14/20 313/313 [==============================] - 1s 4ms/step - loss: 0.0252 - recall: 0.9906 - f1_metric: 0.9923 - val_loss: 0.0990 - val_recall: 0.8987 - val_f1_metric: 0.4081 Epoch 15/20 313/313 [==============================] - 1s 4ms/step - loss: 0.0212 - recall: 0.9926 - f1_metric: 0.9936 - val_loss: 0.1406 - val_recall: 0.9307 - val_f1_metric: 0.4055 Epoch 16/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0196 - recall: 0.9930 - f1_metric: 0.9945 - val_loss: 0.1548 - val_recall: 0.9307 - val_f1_metric: 0.4011 Epoch 17/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0192 - recall: 0.9926 - f1_metric: 0.9942 - val_loss: 0.1182 - val_recall: 0.9040 - val_f1_metric: 0.4138 Epoch 18/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0211 - recall: 0.9904 - f1_metric: 0.9925 - val_loss: 0.1221 - val_recall: 0.9227 - val_f1_metric: 0.4012 Epoch 19/20 313/313 [==============================] - 1s 3ms/step - loss: 0.0174 - recall: 0.9938 - f1_metric: 0.9951 - val_loss: 0.1927 - val_recall: 0.9413 - val_f1_metric: 0.3965 Epoch 20/20 313/313 [==============================] - 1s 4ms/step - loss: 0.0166 - recall: 0.9936 - f1_metric: 0.9954 - val_loss: 0.1259 - val_recall: 0.8933 - val_f1_metric: 0.4174 500/500 [==============================] - 1s 1ms/step [[ 335 40] [ 312 15313]] Precision: 0.518 Accuracy: 0.978 Recall: 0.893 F1: 0.656
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

n_clusters = 2
X_test_array = X_test.values
# n_init=10 pins the pre-1.4 sklearn default explicitly and silences the
# FutureWarning about the default changing to 'auto'.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans.fit(X_test_array)
cluster_labels = kmeans.labels_

# Internal quality metrics: silhouette (cohesion/separation, range [-1, 1])
# and Davies-Bouldin (lower is better).
silhouette_avg = silhouette_score(X_test_array, cluster_labels)
print(f"Silhouette Score: {silhouette_avg}")
db_index = davies_bouldin_score(X_test_array, cluster_labels)
print(f"Davies–Bouldin Index: {db_index}")

# Plot only the first two features — the data is high-dimensional.
plt.scatter(X_test_array[:, 0], X_test_array[:, 1], c=cluster_labels, cmap='viridis', marker='o', edgecolors='k', s=50)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], c='red', marker='X', s=200, label='Centroids')
plt.title('K-means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.show()
C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
Silhouette Score: 0.340885971110426 Davies–Bouldin Index: 0.9948450984993457
from sklearn.metrics import adjusted_rand_score,normalized_mutual_info_score, homogeneity_score, completeness_score, v_measure_score

# External validation: compare the K-means assignment against the true class
# labels. All of these metrics are near 0 when the clustering is unrelated
# to the ground truth.
ari_score = adjusted_rand_score(y_test, cluster_labels)
print(f"Adjusted Rand Index: {ari_score}")
nmi_score = normalized_mutual_info_score(y_test, cluster_labels)
print(f"Normalized Mutual Information: {nmi_score}")
homogeneity = homogeneity_score(y_test, cluster_labels)
completeness = completeness_score(y_test, cluster_labels)
# V-measure is the harmonic mean of homogeneity and completeness.
v_measure = v_measure_score(y_test, cluster_labels)
print(f"Homogeneity: {homogeneity}")
print(f"Completeness: {completeness}")
print(f"V-measure: {v_measure}")
Adjusted Rand Index: -0.022208038473670887 Normalized Mutual Information: 0.023753658721748883 Homogeneity: 0.07881608863354447 Completeness: 0.01398409910362346 V-measure: 0.023753658721748887
Els resultats són dolents. ARI: Un valor negatiu indica que les agrupacions són aleatòries. NMI: Al tenir un valor proper a 0 significa que el clustering i el ground truth són independents. Homogeneity: Un valor proper a 0 indica que els clústers no són homogenis. Completeness: Al tenir un valor baix indica que el clustering no ha pogut capturar completament les agrupacions originals. V-measure: És la mitjana harmònica entre Homogeneity i Completeness. Al tenir un valor baix indica un pobre rendiment.
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# DBSCAN is transductive: fit_predict() refits on the data it receives, so
# the original separate fit(X_train) had no effect on the predictions below
# and has been removed.
dbscan_model = DBSCAN(eps=0.85, min_samples=30)  # eps and min_samples are tunable
predictions_dbscan = dbscan_model.fit_predict(X_test)

X_test_array = X_test.values
# Plot the first two features; colour = cluster id (-1 = noise).
plt.scatter(X_test_array[:, 0], X_test_array[:, 1], c=predictions_dbscan, cmap='viridis', marker='o', edgecolors='k', s=50)
plt.title('DBSCAN Clustering')
plt.xlabel('Feature 1 (Scaled)')
plt.ylabel('Feature 2 (Scaled)')
plt.show()
from sklearn.metrics import calinski_harabasz_score

# Internal clustering-quality metrics for the DBSCAN labels on X_test.
silhouette_avg = silhouette_score(X_test, predictions_dbscan)
print(f"Silhouette Score: {silhouette_avg}")
davies_bouldin = davies_bouldin_score(X_test, predictions_dbscan)
print(f"Davies-Bouldin Index: {davies_bouldin}")
calinski_harabasz = calinski_harabasz_score(X_test, predictions_dbscan)
print(f"Calinski-Harabasz Index: {calinski_harabasz}")
# Per-cluster sample counts; label -1 is DBSCAN noise.
unique_clusters, cluster_counts = np.unique(predictions_dbscan, return_counts=True)
cluster_density = dict(zip(unique_clusters, cluster_counts))
print("Cluster Density:", cluster_density)
Silhouette Score: 0.008660676244886333
Davies-Bouldin Index: 1.6874120150000287
Calinski-Harabasz Index: 1885.685098754403
Cluster Density: {-1: 9856, 0: 4277, 1: 1655, 2: 131, 3: 50, 4: 31}
# NOTE(review): the original called dbscan_model.fit(X_resampled) and then
# fit_predict(X_test). fit_predict refits on X_test, so the resampled fit was
# silently discarded — which is why this cell produced results identical to
# the non-resampled run. The redundant fit has been removed; the outcome is
# unchanged.
dbscan_model = DBSCAN(eps=0.85, min_samples=30)  # eps and min_samples are tunable
predictions_dbscan = dbscan_model.fit_predict(X_test)

X_test_array = X_test.values
# Plot the first two features; colour = cluster id (-1 = noise).
plt.scatter(X_test_array[:, 0], X_test_array[:, 1], c=predictions_dbscan, cmap='viridis', marker='o', edgecolors='k', s=50)
plt.title('DBSCAN Clustering')
plt.xlabel('Feature 1 (Scaled)')
plt.ylabel('Feature 2 (Scaled)')
plt.show()
from sklearn.metrics import calinski_harabasz_score

# Same internal metrics as before, recomputed for the current DBSCAN labels.
silhouette_avg = silhouette_score(X_test, predictions_dbscan)
print(f"Silhouette Score: {silhouette_avg}")
davies_bouldin = davies_bouldin_score(X_test, predictions_dbscan)
print(f"Davies-Bouldin Index: {davies_bouldin}")
calinski_harabasz = calinski_harabasz_score(X_test, predictions_dbscan)
print(f"Calinski-Harabasz Index: {calinski_harabasz}")
# Per-cluster sample counts; label -1 is DBSCAN noise.
unique_clusters, cluster_counts = np.unique(predictions_dbscan, return_counts=True)
cluster_density = dict(zip(unique_clusters, cluster_counts))
print("Cluster Density:", cluster_density)
Silhouette Score: 0.008660676244886333
Davies-Bouldin Index: 1.6874120150000287
Calinski-Harabasz Index: 1885.685098754403
Cluster Density: {-1: 9856, 0: 4277, 1: 1655, 2: 131, 3: 50, 4: 31}
Silhouette Score: Aquesta puntuació indica quina és la cohesió i separació dels clústers. Tenint en compte que el rang és (-1,1), un valor proper a 0 el podríem considerar feble. Davies-Bouldin Index: Indica la compacitat i la separació dels clústers. Un valor alt indicaria que els clústers són dolents. Calinski-Harabasz: També indica la cohesió i la separació dels clústers. En aquest cas, el valor és força alt i, per tant, positiu. Cluster Density: Indica el nombre de punts per clúster.
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs

# Isolation Forest as an anomaly detector; contamination approximates the
# expected anomaly rate. (Commented-out hold-out evaluation removed.)
model_if = IsolationForest(contamination=0.04, random_state=seed_value)
# Map class 0 to -1 so the labels match IsolationForest's output convention
# of {-1: outlier, 1: inlier}.
predictions = np.where(y == 0, -1, y)
calcularValidacioCreuada(model_if, X[top_correlated_variables], predictions)
Scores de validació creuada (recall): [0.22957198 0.29482072 0.27272727 0.30078125 0.23184358] Mitjana dels scores (recall): 0.2659489599427078 Scores de validació creuada (f1): [0.00796598 0.00994691 0.00926672 0.01035225 0.01114468] Mitjana dels scores (f1): 0.009735305180520277
from sklearn.ensemble import IsolationForest
from sklearn.datasets import make_blobs

model_if_resampling = IsolationForest(contamination=0.04, random_state=seed_value)
# NOTE(review): this explicit fit is most likely redundant — the
# cross-validation helper below refits the estimator on each fold.
# Kept to preserve the original behaviour; confirm and remove.
model_if_resampling.fit(X_resampled)
# Map class 0 to -1 to match IsolationForest's {-1: outlier, 1: inlier} labels.
predictions = np.where(y_res == 0, -1, y_res)
calcularValidacioCreuada(model_if_resampling, X_res[top_correlated_variables], predictions)
Scores de validació creuada (recall): [0. 0. 0.93236364 0.89345455 0.89345455] Mitjana dels scores (recall): 0.5438545454545455 Scores de validació creuada (f1): [0. 0. 0.63686041 0.94372959 0.94372959] Mitjana dels scores (f1): 0.5048639193809472
Graduant el paràmetre "contamination" podem trobar un punt en el que tenim un bon recall i F1.
from sklearn.neighbors import LocalOutlierFactor

# novelty=True so the estimator exposes predict() for use inside the
# cross-validation helper. (Commented-out hold-out evaluation removed.)
lof_model = LocalOutlierFactor(contamination='auto',novelty=True)
# Map class 0 to -1 to match LOF's {-1: outlier, 1: inlier} output convention.
predictions_train_mapped = np.where(y_train == 0, -1, 1)
calcularValidacioCreuada(lof_model, X_train[top_correlated_variables], predictions_train_mapped)
C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn(
Scores de validació creuada (recall): [0.84649123 0.8372093 0.82474227 0.89673913 0.82681564] Mitjana dels scores (recall): 0.8463995142659755
C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn(
Scores de validació creuada (f1): [0.0330791 0.03089864 0.02751032 0.02821236 0.0253794 ] Mitjana dels scores (f1): 0.02901596344898318
# Visualise the LOF labels on synthetic 2-D blobs (the real feature space is
# too high-dimensional to plot directly). fit_predict refits on X_test, so
# the original separate fit(X_train) had no effect and has been removed.
lof_model = LocalOutlierFactor(contamination='auto')
predictions_lof = lof_model.fit_predict(X_test)
# Map {-1, 1} to {0 = outlier, 1 = inlier}.
predictions_lof_mapped = np.where(predictions_lof == -1, 0, 1)
# Use a dedicated name for the synthetic points — the original assigned them
# to `X`, clobbering the global feature matrix that later cells index by
# column name.
X_blobs, _ = make_blobs(n_samples=16000, centers=1, random_state=42)
# Mapped label 1 = inlier, 0 = outlier (the original legend was swapped), and
# the title now names the model actually plotted (LOF, not One-Class SVM).
plt.scatter(X_blobs[predictions_lof_mapped == 1, 0], X_blobs[predictions_lof_mapped == 1, 1], c='blue', label='Inliers', edgecolors='k', s=50)
plt.scatter(X_blobs[predictions_lof_mapped == 0, 0], X_blobs[predictions_lof_mapped == 0, 1], c='red', label='Outliers', edgecolors='k', s=50)
plt.title("LOF Outlier Detection")
plt.legend()
plt.show()
from sklearn.neighbors import LocalOutlierFactor

# Same LOF setup, evaluated on the resampled (balanced) data.
# (Commented-out hold-out evaluation removed.)
lof_model_resampling = LocalOutlierFactor(contamination='auto', novelty=True)
# Map class 0 to -1 to match LOF's {-1: outlier, 1: inlier} output convention.
predictions_train_mapped = np.where(y_resampled == 0, -1, 1)
calcularValidacioCreuada(lof_model_resampling, X_resampled[top_correlated_variables], predictions_train_mapped)
C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn(
Scores de validació creuada (recall): [0. 0. 0.88 0.8605 0.889 ] Mitjana dels scores (recall): 0.5259
C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn(
Scores de validació creuada (f1): [0. 0. 0.63791229 0.92502016 0.94123875] Mitjana dels scores (f1): 0.5008342387188665
C:\Users\Ramon\anaconda3\Lib\site-packages\sklearn\base.py:464: UserWarning: X does not have valid feature names, but LocalOutlierFactor was fitted with feature names warnings.warn(
Veiem que independentment del valor de "contamination" sempre obtenim molt bon recall i molt mal F1. Això és perquè estem detectant molts falsos positius i, per tant, la precisió és molt baixa. I si provem d'utilitzar només les variables més correlacionades o amb PCA obtenim resultats igual de dolents.
from sklearn.svm import OneClassSVM

# One-Class SVM as an anomaly detector; nu bounds the fraction of training
# errors / support vectors. (Commented-out hold-out evaluation removed.)
svm_model = OneClassSVM(nu=0.5)  # adjust the nu parameter based on the dataset
# Map class 0 to -1 to match One-Class SVM's {-1: outlier, 1: inlier} labels.
predictions_train_mapped = np.where(y == 0, -1, 1)
calcularValidacioCreuada(svm_model, X[top_correlated_variables], predictions_train_mapped)
Scores de validació creuada (recall): [0.0077821 0.01992032 0.01185771 0.01171875 0. ] Mitjana dels scores (recall): 0.01025577548045924 Scores de validació creuada (f1): [0.0005176 0.00126534 0.00079418 0.0007486 0. ] Mitjana dels scores (f1): 0.0006651426085821843
# Visualise One-Class SVM labels on synthetic 2-D blobs.
svm_model_tmp = OneClassSVM(nu=0.5)  # adjust the nu parameter based on the dataset
svm_model_tmp.fit(X_train)
predictions_svm = svm_model_tmp.predict(X_test)
# Map {-1, 1} to {0 = outlier, 1 = inlier}.
predictions_svm_mapped = np.where(predictions_svm == -1, 0, 1)
# Use a dedicated name for the synthetic points — the original assigned them
# to `X`, clobbering the global feature matrix that later cells index by
# column name.
X_blobs, _ = make_blobs(n_samples=16000, centers=1, random_state=42)
# Mapped label 1 = inlier, 0 = outlier (the original legend was swapped).
plt.scatter(X_blobs[predictions_svm_mapped == 1, 0], X_blobs[predictions_svm_mapped == 1, 1], c='blue', label='Inliers', edgecolors='k', s=50)
plt.scatter(X_blobs[predictions_svm_mapped == 0, 0], X_blobs[predictions_svm_mapped == 0, 1], c='red', label='Outliers', edgecolors='k', s=50)
plt.title("One-Class SVM Outlier Detection")
plt.legend()
plt.show()
from sklearn.svm import OneClassSVM

# One-Class SVM evaluated on the resampled (balanced) data. The original kept
# a commented-out hold-out evaluation here whose `predict` call had been
# accidentally split across two lines; it has been removed as dead code.
svm_model = OneClassSVM(nu=0.5)  # adjust the nu parameter based on the dataset
# Map class 0 to -1 to match One-Class SVM's {-1: outlier, 1: inlier} labels.
predictions_train_mapped = np.where(y_res == 0, -1, 1)
calcularValidacioCreuada(svm_model, X_res[top_correlated_variables], predictions_train_mapped)
Scores de validació creuada (recall): [0. 0. 0.28290909 0.11745455 0.12472727] Mitjana dels scores (recall): 0.10501818181818182 Scores de validació creuada (f1): [0. 0. 0.28106936 0.21021803 0.22179114] Mitjana dels scores (f1): 0.14261570668686158
# Visualise the resampled-trained One-Class SVM labels on synthetic 2-D blobs.
svm_model_resampling_tmp = OneClassSVM(nu=0.5)  # adjust the nu parameter based on the dataset
svm_model_resampling_tmp.fit(X_resampled)
predictions_svm = svm_model_resampling_tmp.predict(X_test)
# Map {-1, 1} to {0 = outlier, 1 = inlier}.
predictions_svm_mapped = np.where(predictions_svm == -1, 0, 1)
# Use a dedicated name for the synthetic points — the original assigned them
# to `X`, clobbering the global feature matrix that later cells index by
# column name.
X_blobs, _ = make_blobs(n_samples=16000, centers=1, random_state=42)
# Mapped label 1 = inlier, 0 = outlier (the original legend was swapped).
plt.scatter(X_blobs[predictions_svm_mapped == 1, 0], X_blobs[predictions_svm_mapped == 1, 1], c='blue', label='Inliers', edgecolors='k', s=50)
plt.scatter(X_blobs[predictions_svm_mapped == 0, 0], X_blobs[predictions_svm_mapped == 0, 1], c='red', label='Outliers', edgecolors='k', s=50)
plt.title("One-Class SVM Outlier Detection")
plt.legend()
plt.show()
from tensorflow import keras

# --- Autoencoder anomaly detector trained on the (unbalanced) train set ---
input_dim = X_train.shape[1]
encoding_dim = 32

# Standardise with statistics computed on the training set only, so no
# information from the test set leaks into the normalisation.
mean_train = np.mean(X_train, axis=0)
std_train = np.std(X_train, axis=0)
X_train_normalized = (X_train - mean_train) / std_train
X_test_normalized = (X_test - mean_train) / std_train

# Single-hidden-layer autoencoder: input_dim -> 32 -> input_dim.
autoencoder = keras.Sequential([
    keras.layers.InputLayer(input_shape=(input_dim,)),
    keras.layers.Dense(encoding_dim, activation='relu'),
    keras.layers.Dense(input_dim, activation='sigmoid')
])
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_normalized, X_train_normalized, epochs=20, batch_size=64, shuffle=True, validation_data=(X_test_normalized, X_test_normalized))

# BUG FIX: the network was trained on normalised data, so it must also be
# evaluated on X_test_normalized. The original code reconstructed the raw
# X_test while comparing the error against the normalised test set, mixing
# two different scales in the MSE.
reconstructed_instances = autoencoder.predict(X_test_normalized)

# Per-sample reconstruction error; samples above the 94th percentile of the
# error distribution are flagged as anomalies (predicted class 1).
mse = np.mean(np.square(X_test_normalized - reconstructed_instances), axis=1)
threshold = np.percentile(mse, 94)
predictions_autoencoder = (mse > threshold).astype(int)
print("Autoencoder Predictions:", predictions_autoencoder)
calcularQualitatResultats(y_test, predictions_autoencoder)
Epoch 1/20
938/938 [==============================] - 2s 2ms/step - loss: 0.7436 - val_loss: 0.7069
Epoch 2/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6777 - val_loss: 0.6897
Epoch 3/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6665 - val_loss: 0.6819
Epoch 4/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6610 - val_loss: 0.6777
Epoch 5/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6578 - val_loss: 0.6748
Epoch 6/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6557 - val_loss: 0.6726
Epoch 7/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6540 - val_loss: 0.6707
Epoch 8/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6525 - val_loss: 0.6694
Epoch 9/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6515 - val_loss: 0.6687
Epoch 10/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6508 - val_loss: 0.6682
Epoch 11/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6503 - val_loss: 0.6677
Epoch 12/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6499 - val_loss: 0.6674
Epoch 13/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6495 - val_loss: 0.6671
Epoch 14/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6492 - val_loss: 0.6667
Epoch 15/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6489 - val_loss: 0.6663
Epoch 16/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6480 - val_loss: 0.6652
Epoch 17/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6477 - val_loss: 0.6651
Epoch 18/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6475 - val_loss: 0.6649
Epoch 19/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6474 - val_loss: 0.6645
Epoch 20/20
938/938 [==============================] - 2s 2ms/step - loss: 0.6472 - val_loss: 0.6647
500/500 [==============================] - 1s 1ms/step
Autoencoder Predictions: 0 0
1 0
2 0
3 0
4 0
..
15995 0
15996 0
15997 0
15998 0
15999 0
Length: 16000, dtype: int32
[[ 325 50]
[ 635 14990]]
Precision: 0.339
Accuracy: 0.957
Recall: 0.867
F1: 0.487
# Convert to a plain array for positional row indexing in the plots.
X_test_array = X_test_normalized.to_numpy()

# Plot original vs. reconstructed feature vectors side by side for a few
# test samples, one row of subplots per sample.
n_samples = min(3, X_test_array.shape[0])  # number of samples to visualize
plt.figure(figsize=(8, 4 * n_samples))
for i in range(n_samples):
    # Original data (left column).
    plt.subplot(n_samples, 2, 2 * i + 1)
    plt.scatter(range(X_test_array.shape[1]), X_test_array[i], c='blue', marker='o', label='Original')
    plt.title(f'Original Data - Sample {i + 1}')
    plt.xlabel('Feature Index')
    plt.ylabel('Feature Value')
    plt.legend()  # BUG FIX: label= was set but legend() was never called
    # Reconstructed data (right column).
    plt.subplot(n_samples, 2, 2 * i + 2)
    plt.scatter(range(X_test_array.shape[1]), reconstructed_instances[i], c='red', marker='x', label='Reconstructed')
    plt.title(f'Reconstructed Data - Sample {i + 1}')
    plt.xlabel('Feature Index')
    plt.ylabel('Feature Value')
    plt.legend()  # BUG FIX: as above
plt.tight_layout()
plt.show()
from tensorflow import keras

# --- Autoencoder anomaly detector trained on the resampled train set ---
# NOTE(review): input_dim is taken from X_train; assumes X_resampled has the
# same number of columns — confirm.
input_dim = X_train.shape[1]
encoding_dim = 32

# Standardise with statistics computed on the resampled training data only.
mean_train = np.mean(X_resampled, axis=0)
std_train = np.std(X_resampled, axis=0)
X_train_normalized = (X_resampled - mean_train) / std_train
X_test_normalized = (X_test - mean_train) / std_train

# Single-hidden-layer autoencoder: input_dim -> 32 -> input_dim.
autoencoder = keras.Sequential([
    keras.layers.InputLayer(input_shape=(input_dim,)),
    keras.layers.Dense(encoding_dim, activation='relu'),
    keras.layers.Dense(input_dim, activation='sigmoid')
])
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train_normalized, X_train_normalized, epochs=20, batch_size=64, shuffle=True, validation_data=(X_test_normalized, X_test_normalized))

# BUG FIX: the network was trained on normalised data, so it must also be
# evaluated on X_test_normalized. The original code reconstructed the raw
# X_test while comparing the error against the normalised test set, mixing
# two different scales in the MSE.
reconstructed_instances = autoencoder.predict(X_test_normalized)

# Per-sample reconstruction error; samples above the 90th percentile of the
# error distribution are flagged as anomalies (predicted class 1).
mse = np.mean(np.square(X_test_normalized - reconstructed_instances), axis=1)
threshold = np.percentile(mse, 90)
predictions_autoencoder = (mse > threshold).astype(int)
print("Autoencoder Predictions:", predictions_autoencoder)
calcularQualitatResultats(y_test, predictions_autoencoder)
Epoch 1/20
157/157 [==============================] - 1s 5ms/step - loss: 0.8482 - val_loss: 0.6127
Epoch 2/20
157/157 [==============================] - 1s 3ms/step - loss: 0.6859 - val_loss: 0.5966
Epoch 3/20
157/157 [==============================] - 1s 4ms/step - loss: 0.6480 - val_loss: 0.5893
Epoch 4/20
157/157 [==============================] - 1s 3ms/step - loss: 0.6285 - val_loss: 0.5842
Epoch 5/20
157/157 [==============================] - 1s 3ms/step - loss: 0.6167 - val_loss: 0.5801
Epoch 6/20
157/157 [==============================] - 1s 3ms/step - loss: 0.6079 - val_loss: 0.5768
Epoch 7/20
157/157 [==============================] - 1s 3ms/step - loss: 0.6012 - val_loss: 0.5739
Epoch 8/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5957 - val_loss: 0.5714
Epoch 9/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5914 - val_loss: 0.5696
Epoch 10/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5877 - val_loss: 0.5680
Epoch 11/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5845 - val_loss: 0.5668
Epoch 12/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5819 - val_loss: 0.5657
Epoch 13/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5796 - val_loss: 0.5648
Epoch 14/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5775 - val_loss: 0.5639
Epoch 15/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5756 - val_loss: 0.5633
Epoch 16/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5740 - val_loss: 0.5627
Epoch 17/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5725 - val_loss: 0.5620
Epoch 18/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5711 - val_loss: 0.5614
Epoch 19/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5699 - val_loss: 0.5610
Epoch 20/20
157/157 [==============================] - 1s 3ms/step - loss: 0.5687 - val_loss: 0.5605
500/500 [==============================] - 1s 1ms/step
Autoencoder Predictions: 0 0
1 0
2 1
3 0
4 0
..
15995 0
15996 1
15997 0
15998 0
15999 0
Length: 16000, dtype: int32
[[ 114 261]
[ 1486 14139]]
Precision: 0.071
Accuracy: 0.891
Recall: 0.304
F1: 0.115
# Convert to a plain array for positional row indexing in the plots.
X_test_array = X_test_normalized.to_numpy()

# Plot original vs. reconstructed feature vectors side by side for a few
# test samples, one row of subplots per sample.
n_samples = min(3, X_test_array.shape[0])  # number of samples to visualize
plt.figure(figsize=(8, 4 * n_samples))
for i in range(n_samples):
    # Original data (left column).
    plt.subplot(n_samples, 2, 2 * i + 1)
    plt.scatter(range(X_test_array.shape[1]), X_test_array[i], c='blue', marker='o', label='Original')
    plt.title(f'Original Data - Sample {i + 1}')
    plt.xlabel('Feature Index')
    plt.ylabel('Feature Value')
    plt.legend()  # BUG FIX: label= was set but legend() was never called
    # Reconstructed data (right column).
    plt.subplot(n_samples, 2, 2 * i + 2)
    plt.scatter(range(X_test_array.shape[1]), reconstructed_instances[i], c='red', marker='x', label='Reconstructed')
    plt.title(f'Reconstructed Data - Sample {i + 1}')
    plt.xlabel('Feature Index')
    plt.ylabel('Feature Value')
    plt.legend()  # BUG FIX: as above
plt.tight_layout()
plt.show()
from pyod.models.hbos import HBOS
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Histogram-Based Outlier Score detector; contamination = expected fraction
# of outliers (0.06 ≈ this dataset's failure rate).
# NOTE(review): passing y to an unsupervised pyod model triggers the
# UserWarning captured in the output below — presumably intentional so the
# cross-validation helper can score against the labels; verify.
# NOTE(review): earlier cells reassign X to synthetic make_blobs output —
# confirm X here still refers to the original feature DataFrame.
hbos_model = HBOS(contamination=0.06) # You can adjust the contamination parameter
calcularValidacioCreuada(hbos_model, X[top_correlated_variables], y)
C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn(
Scores de validació creuada (recall): [0.85992218 0.80079681 0.81422925 0.81640625 0.84078212] Mitjana dels scores (recall): 0.8264273227308433 Scores de validació creuada (f1): [0.35964199 0.35989257 0.35982533 0.36828194 0.458492 ] Mitjana dels scores (f1): 0.3812267647239174
C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn(
# Fit HBOS on the raw training set and evaluate on the held-out test set.
hbos_model_tmp = HBOS(contamination=0.06)  # You can adjust the contamination parameter
hbos_model_tmp.fit(X_train)
predictions_hbos = hbos_model_tmp.predict(X_test)

classification_rep = classification_report(y_test, predictions_hbos)
print("Classification Report:")
print(classification_rep)
calcularQualitatResultats(y_test, predictions_hbos)

# BUG FIX: score with the model actually fitted above (hbos_model_tmp). The
# original called decision_function on hbos_model, which this cell never
# fits, so the plotted scores did not correspond to the reported predictions.
anomaly_scores = hbos_model_tmp.decision_function(X_test)
plt.figure(figsize=(8, 4))
plt.scatter(range(len(anomaly_scores)), anomaly_scores, c='red', marker='o', edgecolors='k', s=50)
plt.axhline(y=0, color='blue', linestyle='--', label='Decision Boundary')
plt.title('HBOS Anomaly Scores')
plt.xlabel('Data Point Index')
plt.ylabel('Anomaly Score')
plt.legend()
plt.show()
Classification Report:
precision recall f1-score support
0 0.99 0.96 0.97 15625
1 0.27 0.67 0.38 375
accuracy 0.95 16000
macro avg 0.63 0.82 0.68 16000
weighted avg 0.97 0.95 0.96 16000
[[ 253 122]
[ 696 14929]]
Precision: 0.267
Accuracy: 0.949
Recall: 0.675
F1: 0.382
from pyod.models.hbos import HBOS
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# HBOS cross-validated on the resampled data; contamination=0.5 presumably
# matches a ~50/50 class balance after resampling — TODO confirm, since 0.5
# is the maximum pyod accepts.
# NOTE(review): passing y_res to an unsupervised pyod model triggers the
# UserWarning captured in the output below.
hbos_model_resampling = HBOS(contamination=0.5) # You can adjust the contamination parameter
calcularValidacioCreuada(hbos_model_resampling, X_res[top_correlated_variables], y_res)
Scores de validació creuada (recall): [0. 0. 0.78690909 0.93745455 0.92072727] Mitjana dels scores (recall): 0.5290181818181818 Scores de validació creuada (f1): [0. 0. 0.78805535 0.96771772 0.95872775] Mitjana dels scores (f1): 0.5429001651194381
C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn( C:\Users\Ramon\anaconda3\Lib\site-packages\pyod\models\base.py:430: UserWarning: y should not be presented in unsupervised learning. warnings.warn(
# Fit HBOS on the resampled training set and evaluate on the held-out test set.
hbos_model_tmp = HBOS(contamination=0.5)  # You can adjust the contamination parameter
hbos_model_tmp.fit(X_resampled)
predictions_hbos = hbos_model_tmp.predict(X_test)

classification_rep = classification_report(y_test, predictions_hbos)
print("Classification Report:")
print(classification_rep)
calcularQualitatResultats(y_test, predictions_hbos)

# BUG FIX: score with the model actually fitted above (hbos_model_tmp). The
# original called decision_function on hbos_model — a different model from
# an earlier cell — so the plotted scores did not correspond to the
# predictions evaluated here.
anomaly_scores = hbos_model_tmp.decision_function(X_test)
plt.figure(figsize=(8, 4))
plt.scatter(range(len(anomaly_scores)), anomaly_scores, c='red', marker='o', edgecolors='k', s=50)
plt.axhline(y=0, color='blue', linestyle='--', label='Decision Boundary')
plt.title('HBOS Anomaly Scores')
plt.xlabel('Data Point Index')
plt.ylabel('Anomaly Score')
plt.legend()
plt.show()
Classification Report:
precision recall f1-score support
0 0.98 0.56 0.71 15625
1 0.03 0.51 0.05 375
accuracy 0.56 16000
macro avg 0.50 0.53 0.38 16000
weighted avg 0.96 0.56 0.70 16000
[[ 191 184]
[6899 8726]]
Precision: 0.027
Accuracy: 0.557
Recall: 0.509
F1: 0.051